DuckyDuck123 commited on
Commit
fdbfa85
·
verified ·
1 Parent(s): 1f8a38d

Upload folder using huggingface_hub

Browse files
checkpoint-1563/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 128,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 512,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 512,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 2,
21
+ "num_hidden_layers": 2,
22
+ "pad_token_id": 0,
23
+ "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
+ "type_vocab_size": 2,
27
+ "use_cache": false,
28
+ "vocab_size": 30522
29
+ }
checkpoint-1563/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b21739cbb462f3171b180a97a73ab2d20e5a11411b00bc6b500dae999c83f1e6
3
+ size 17549304
checkpoint-1563/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a256413bfa7f410b39b9495f90fafd978f30ad1026d358d70d6b8c5b59a6d8bd
3
+ size 35124939
checkpoint-1563/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecc51cab2a0807d9bab78751494474272ab74c5227da8b3cc7f8fd53049599e
3
+ size 14645
checkpoint-1563/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b066d688f632c8d3d7598b3c6230abba0dd07074b69c51c980fad111fb859277
3
+ size 1465
checkpoint-1563/trainer_state.json ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1563,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06397952655150352,
14
+ "grad_norm": 2.6240322589874268,
15
+ "learning_rate": 1.974664107485605e-05,
16
+ "loss": 0.692889175415039,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.12795905310300704,
21
+ "grad_norm": 1.6500952243804932,
22
+ "learning_rate": 1.9490722968650032e-05,
23
+ "loss": 0.6690621948242188,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.19193857965451055,
28
+ "grad_norm": 5.539139747619629,
29
+ "learning_rate": 1.923480486244402e-05,
30
+ "loss": 0.6443107604980469,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.2559181062060141,
35
+ "grad_norm": 9.561639785766602,
36
+ "learning_rate": 1.8978886756238006e-05,
37
+ "loss": 0.6174827194213868,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.3198976327575176,
42
+ "grad_norm": 3.9405109882354736,
43
+ "learning_rate": 1.8722968650031992e-05,
44
+ "loss": 0.603485221862793,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.3838771593090211,
49
+ "grad_norm": 5.654637336730957,
50
+ "learning_rate": 1.846705054382598e-05,
51
+ "loss": 0.5690624237060546,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.44785668586052463,
56
+ "grad_norm": 6.288730144500732,
57
+ "learning_rate": 1.8211132437619962e-05,
58
+ "loss": 0.5585448455810547,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.5118362124120281,
63
+ "grad_norm": 7.303677558898926,
64
+ "learning_rate": 1.795521433141395e-05,
65
+ "loss": 0.5453743362426757,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.5758157389635317,
70
+ "grad_norm": 4.167948246002197,
71
+ "learning_rate": 1.7699296225207935e-05,
72
+ "loss": 0.5284878540039063,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.6397952655150352,
77
+ "grad_norm": 6.335079669952393,
78
+ "learning_rate": 1.744337811900192e-05,
79
+ "loss": 0.497203369140625,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.7037747920665387,
84
+ "grad_norm": 9.398835182189941,
85
+ "learning_rate": 1.7187460012795908e-05,
86
+ "loss": 0.4662479782104492,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.7677543186180422,
91
+ "grad_norm": 17.086259841918945,
92
+ "learning_rate": 1.693154190658989e-05,
93
+ "loss": 0.4858377456665039,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.8317338451695457,
98
+ "grad_norm": 11.53818416595459,
99
+ "learning_rate": 1.6675623800383878e-05,
100
+ "loss": 0.45729072570800783,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.8957133717210493,
105
+ "grad_norm": 22.93279457092285,
106
+ "learning_rate": 1.6419705694177864e-05,
107
+ "loss": 0.45677249908447265,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.9596928982725528,
112
+ "grad_norm": 15.763897895812988,
113
+ "learning_rate": 1.616378758797185e-05,
114
+ "loss": 0.4614885711669922,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 1.0,
119
+ "eval_accuracy": 0.81124,
120
+ "eval_f1": 0.8026596411993476,
121
+ "eval_loss": 0.4272981286048889,
122
+ "eval_precision": 0.8408832033645842,
123
+ "eval_recall": 0.76776,
124
+ "eval_runtime": 16.5749,
125
+ "eval_samples_per_second": 1508.308,
126
+ "eval_steps_per_second": 94.299,
127
+ "step": 1563
128
+ }
129
+ ],
130
+ "logging_steps": 100,
131
+ "max_steps": 7815,
132
+ "num_input_tokens_seen": 0,
133
+ "num_train_epochs": 5,
134
+ "save_steps": 500,
135
+ "stateful_callbacks": {
136
+ "TrainerControl": {
137
+ "args": {
138
+ "should_epoch_stop": false,
139
+ "should_evaluate": false,
140
+ "should_log": false,
141
+ "should_save": true,
142
+ "should_training_stop": false
143
+ },
144
+ "attributes": {}
145
+ }
146
+ },
147
+ "total_flos": 15881088000000.0,
148
+ "train_batch_size": 16,
149
+ "trial_name": null,
150
+ "trial_params": null
151
+ }
checkpoint-1563/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72e5730834fb50b731e6ed2a038bc96471cea902a782210769d571be5fa37df0
3
+ size 5201
checkpoint-3126/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 128,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 512,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 512,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 2,
21
+ "num_hidden_layers": 2,
22
+ "pad_token_id": 0,
23
+ "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
+ "type_vocab_size": 2,
27
+ "use_cache": false,
28
+ "vocab_size": 30522
29
+ }
checkpoint-3126/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8065680bcd8ce183b9a98d08c8fc08cd68e1663654b8b7e61d21999d4a0aa756
3
+ size 17549304
checkpoint-3126/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58b45ba52cc295e984559e3f471eff90e1a79d953bf4cf8d5279d3c7d1a5ec3a
3
+ size 35124939
checkpoint-3126/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:938bd0d5986458918e2977526a8cbcb22bbf3e51d0d2fce4d8466e00816238c6
3
+ size 14645
checkpoint-3126/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3467314533cca9833232039cb4699563afa5305362ec3f4f90b43047b774ea2
3
+ size 1465
checkpoint-3126/trainer_state.json ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3126,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06397952655150352,
14
+ "grad_norm": 2.6240322589874268,
15
+ "learning_rate": 1.974664107485605e-05,
16
+ "loss": 0.692889175415039,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.12795905310300704,
21
+ "grad_norm": 1.6500952243804932,
22
+ "learning_rate": 1.9490722968650032e-05,
23
+ "loss": 0.6690621948242188,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.19193857965451055,
28
+ "grad_norm": 5.539139747619629,
29
+ "learning_rate": 1.923480486244402e-05,
30
+ "loss": 0.6443107604980469,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.2559181062060141,
35
+ "grad_norm": 9.561639785766602,
36
+ "learning_rate": 1.8978886756238006e-05,
37
+ "loss": 0.6174827194213868,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.3198976327575176,
42
+ "grad_norm": 3.9405109882354736,
43
+ "learning_rate": 1.8722968650031992e-05,
44
+ "loss": 0.603485221862793,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.3838771593090211,
49
+ "grad_norm": 5.654637336730957,
50
+ "learning_rate": 1.846705054382598e-05,
51
+ "loss": 0.5690624237060546,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.44785668586052463,
56
+ "grad_norm": 6.288730144500732,
57
+ "learning_rate": 1.8211132437619962e-05,
58
+ "loss": 0.5585448455810547,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.5118362124120281,
63
+ "grad_norm": 7.303677558898926,
64
+ "learning_rate": 1.795521433141395e-05,
65
+ "loss": 0.5453743362426757,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.5758157389635317,
70
+ "grad_norm": 4.167948246002197,
71
+ "learning_rate": 1.7699296225207935e-05,
72
+ "loss": 0.5284878540039063,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.6397952655150352,
77
+ "grad_norm": 6.335079669952393,
78
+ "learning_rate": 1.744337811900192e-05,
79
+ "loss": 0.497203369140625,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.7037747920665387,
84
+ "grad_norm": 9.398835182189941,
85
+ "learning_rate": 1.7187460012795908e-05,
86
+ "loss": 0.4662479782104492,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.7677543186180422,
91
+ "grad_norm": 17.086259841918945,
92
+ "learning_rate": 1.693154190658989e-05,
93
+ "loss": 0.4858377456665039,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.8317338451695457,
98
+ "grad_norm": 11.53818416595459,
99
+ "learning_rate": 1.6675623800383878e-05,
100
+ "loss": 0.45729072570800783,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.8957133717210493,
105
+ "grad_norm": 22.93279457092285,
106
+ "learning_rate": 1.6419705694177864e-05,
107
+ "loss": 0.45677249908447265,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.9596928982725528,
112
+ "grad_norm": 15.763897895812988,
113
+ "learning_rate": 1.616378758797185e-05,
114
+ "loss": 0.4614885711669922,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 1.0,
119
+ "eval_accuracy": 0.81124,
120
+ "eval_f1": 0.8026596411993476,
121
+ "eval_loss": 0.4272981286048889,
122
+ "eval_precision": 0.8408832033645842,
123
+ "eval_recall": 0.76776,
124
+ "eval_runtime": 16.5749,
125
+ "eval_samples_per_second": 1508.308,
126
+ "eval_steps_per_second": 94.299,
127
+ "step": 1563
128
+ },
129
+ {
130
+ "epoch": 1.0236724248240563,
131
+ "grad_norm": 16.664216995239258,
132
+ "learning_rate": 1.5907869481765838e-05,
133
+ "loss": 0.44625389099121093,
134
+ "step": 1600
135
+ },
136
+ {
137
+ "epoch": 1.0876519513755598,
138
+ "grad_norm": 23.07805824279785,
139
+ "learning_rate": 1.565195137555982e-05,
140
+ "loss": 0.44664024353027343,
141
+ "step": 1700
142
+ },
143
+ {
144
+ "epoch": 1.1516314779270633,
145
+ "grad_norm": 8.32645320892334,
146
+ "learning_rate": 1.5396033269353807e-05,
147
+ "loss": 0.42939315795898436,
148
+ "step": 1800
149
+ },
150
+ {
151
+ "epoch": 1.2156110044785668,
152
+ "grad_norm": 7.903297424316406,
153
+ "learning_rate": 1.5140115163147796e-05,
154
+ "loss": 0.41601951599121095,
155
+ "step": 1900
156
+ },
157
+ {
158
+ "epoch": 1.2795905310300704,
159
+ "grad_norm": 11.639277458190918,
160
+ "learning_rate": 1.488419705694178e-05,
161
+ "loss": 0.39629173278808594,
162
+ "step": 2000
163
+ },
164
+ {
165
+ "epoch": 1.3435700575815739,
166
+ "grad_norm": 6.199832916259766,
167
+ "learning_rate": 1.4628278950735765e-05,
168
+ "loss": 0.3827814102172852,
169
+ "step": 2100
170
+ },
171
+ {
172
+ "epoch": 1.4075495841330774,
173
+ "grad_norm": 17.21854019165039,
174
+ "learning_rate": 1.4372360844529752e-05,
175
+ "loss": 0.41142051696777343,
176
+ "step": 2200
177
+ },
178
+ {
179
+ "epoch": 1.471529110684581,
180
+ "grad_norm": 15.716930389404297,
181
+ "learning_rate": 1.4116442738323737e-05,
182
+ "loss": 0.38613304138183596,
183
+ "step": 2300
184
+ },
185
+ {
186
+ "epoch": 1.5355086372360844,
187
+ "grad_norm": 6.01155948638916,
188
+ "learning_rate": 1.3860524632117725e-05,
189
+ "loss": 0.41208648681640625,
190
+ "step": 2400
191
+ },
192
+ {
193
+ "epoch": 1.599488163787588,
194
+ "grad_norm": 20.55832862854004,
195
+ "learning_rate": 1.360460652591171e-05,
196
+ "loss": 0.38446582794189454,
197
+ "step": 2500
198
+ },
199
+ {
200
+ "epoch": 1.6634676903390915,
201
+ "grad_norm": 11.465950012207031,
202
+ "learning_rate": 1.3348688419705695e-05,
203
+ "loss": 0.37646575927734377,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 1.727447216890595,
208
+ "grad_norm": 3.505220651626587,
209
+ "learning_rate": 1.3092770313499681e-05,
210
+ "loss": 0.3866144561767578,
211
+ "step": 2700
212
+ },
213
+ {
214
+ "epoch": 1.7914267434420985,
215
+ "grad_norm": 27.13107681274414,
216
+ "learning_rate": 1.2836852207293666e-05,
217
+ "loss": 0.4025085830688477,
218
+ "step": 2800
219
+ },
220
+ {
221
+ "epoch": 1.855406269993602,
222
+ "grad_norm": 7.345473766326904,
223
+ "learning_rate": 1.2580934101087654e-05,
224
+ "loss": 0.3890159225463867,
225
+ "step": 2900
226
+ },
227
+ {
228
+ "epoch": 1.9193857965451055,
229
+ "grad_norm": 24.46322250366211,
230
+ "learning_rate": 1.232501599488164e-05,
231
+ "loss": 0.381514778137207,
232
+ "step": 3000
233
+ },
234
+ {
235
+ "epoch": 1.983365323096609,
236
+ "grad_norm": 6.782553195953369,
237
+ "learning_rate": 1.2069097888675624e-05,
238
+ "loss": 0.3956758499145508,
239
+ "step": 3100
240
+ },
241
+ {
242
+ "epoch": 2.0,
243
+ "eval_accuracy": 0.8286,
244
+ "eval_f1": 0.8181779607077694,
245
+ "eval_loss": 0.3925381898880005,
246
+ "eval_precision": 0.8711484593837535,
247
+ "eval_recall": 0.77128,
248
+ "eval_runtime": 18.1817,
249
+ "eval_samples_per_second": 1375.01,
250
+ "eval_steps_per_second": 85.966,
251
+ "step": 3126
252
+ }
253
+ ],
254
+ "logging_steps": 100,
255
+ "max_steps": 7815,
256
+ "num_input_tokens_seen": 0,
257
+ "num_train_epochs": 5,
258
+ "save_steps": 500,
259
+ "stateful_callbacks": {
260
+ "TrainerControl": {
261
+ "args": {
262
+ "should_epoch_stop": false,
263
+ "should_evaluate": false,
264
+ "should_log": false,
265
+ "should_save": true,
266
+ "should_training_stop": false
267
+ },
268
+ "attributes": {}
269
+ }
270
+ },
271
+ "total_flos": 31762176000000.0,
272
+ "train_batch_size": 16,
273
+ "trial_name": null,
274
+ "trial_params": null
275
+ }
checkpoint-3126/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72e5730834fb50b731e6ed2a038bc96471cea902a782210769d571be5fa37df0
3
+ size 5201
checkpoint-4689/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 128,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 512,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 512,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 2,
21
+ "num_hidden_layers": 2,
22
+ "pad_token_id": 0,
23
+ "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
+ "type_vocab_size": 2,
27
+ "use_cache": false,
28
+ "vocab_size": 30522
29
+ }
checkpoint-4689/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850cb5c7a7a3911c442eb220b5aa09f555940c165c58d2014f6a3759e9daefad
3
+ size 17549304
checkpoint-4689/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0036c06d6d2680c7819f2f628179bf43be254bd9de48ef5dbd585ee63fd194f8
3
+ size 35124939
checkpoint-4689/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da02765be33e2403a5cc456e72a86b6880abe7188c7c1cc9cc9ac7f65934c4d2
3
+ size 14645
checkpoint-4689/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3f4700fb7d477d8c66661bf4df5eb8b880b1a4e61cf4a5bff3a626b214accd6
3
+ size 1465
checkpoint-4689/trainer_state.json ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4689,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06397952655150352,
14
+ "grad_norm": 2.6240322589874268,
15
+ "learning_rate": 1.974664107485605e-05,
16
+ "loss": 0.692889175415039,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.12795905310300704,
21
+ "grad_norm": 1.6500952243804932,
22
+ "learning_rate": 1.9490722968650032e-05,
23
+ "loss": 0.6690621948242188,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.19193857965451055,
28
+ "grad_norm": 5.539139747619629,
29
+ "learning_rate": 1.923480486244402e-05,
30
+ "loss": 0.6443107604980469,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.2559181062060141,
35
+ "grad_norm": 9.561639785766602,
36
+ "learning_rate": 1.8978886756238006e-05,
37
+ "loss": 0.6174827194213868,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.3198976327575176,
42
+ "grad_norm": 3.9405109882354736,
43
+ "learning_rate": 1.8722968650031992e-05,
44
+ "loss": 0.603485221862793,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.3838771593090211,
49
+ "grad_norm": 5.654637336730957,
50
+ "learning_rate": 1.846705054382598e-05,
51
+ "loss": 0.5690624237060546,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.44785668586052463,
56
+ "grad_norm": 6.288730144500732,
57
+ "learning_rate": 1.8211132437619962e-05,
58
+ "loss": 0.5585448455810547,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.5118362124120281,
63
+ "grad_norm": 7.303677558898926,
64
+ "learning_rate": 1.795521433141395e-05,
65
+ "loss": 0.5453743362426757,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.5758157389635317,
70
+ "grad_norm": 4.167948246002197,
71
+ "learning_rate": 1.7699296225207935e-05,
72
+ "loss": 0.5284878540039063,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.6397952655150352,
77
+ "grad_norm": 6.335079669952393,
78
+ "learning_rate": 1.744337811900192e-05,
79
+ "loss": 0.497203369140625,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.7037747920665387,
84
+ "grad_norm": 9.398835182189941,
85
+ "learning_rate": 1.7187460012795908e-05,
86
+ "loss": 0.4662479782104492,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.7677543186180422,
91
+ "grad_norm": 17.086259841918945,
92
+ "learning_rate": 1.693154190658989e-05,
93
+ "loss": 0.4858377456665039,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.8317338451695457,
98
+ "grad_norm": 11.53818416595459,
99
+ "learning_rate": 1.6675623800383878e-05,
100
+ "loss": 0.45729072570800783,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.8957133717210493,
105
+ "grad_norm": 22.93279457092285,
106
+ "learning_rate": 1.6419705694177864e-05,
107
+ "loss": 0.45677249908447265,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.9596928982725528,
112
+ "grad_norm": 15.763897895812988,
113
+ "learning_rate": 1.616378758797185e-05,
114
+ "loss": 0.4614885711669922,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 1.0,
119
+ "eval_accuracy": 0.81124,
120
+ "eval_f1": 0.8026596411993476,
121
+ "eval_loss": 0.4272981286048889,
122
+ "eval_precision": 0.8408832033645842,
123
+ "eval_recall": 0.76776,
124
+ "eval_runtime": 16.5749,
125
+ "eval_samples_per_second": 1508.308,
126
+ "eval_steps_per_second": 94.299,
127
+ "step": 1563
128
+ },
129
+ {
130
+ "epoch": 1.0236724248240563,
131
+ "grad_norm": 16.664216995239258,
132
+ "learning_rate": 1.5907869481765838e-05,
133
+ "loss": 0.44625389099121093,
134
+ "step": 1600
135
+ },
136
+ {
137
+ "epoch": 1.0876519513755598,
138
+ "grad_norm": 23.07805824279785,
139
+ "learning_rate": 1.565195137555982e-05,
140
+ "loss": 0.44664024353027343,
141
+ "step": 1700
142
+ },
143
+ {
144
+ "epoch": 1.1516314779270633,
145
+ "grad_norm": 8.32645320892334,
146
+ "learning_rate": 1.5396033269353807e-05,
147
+ "loss": 0.42939315795898436,
148
+ "step": 1800
149
+ },
150
+ {
151
+ "epoch": 1.2156110044785668,
152
+ "grad_norm": 7.903297424316406,
153
+ "learning_rate": 1.5140115163147796e-05,
154
+ "loss": 0.41601951599121095,
155
+ "step": 1900
156
+ },
157
+ {
158
+ "epoch": 1.2795905310300704,
159
+ "grad_norm": 11.639277458190918,
160
+ "learning_rate": 1.488419705694178e-05,
161
+ "loss": 0.39629173278808594,
162
+ "step": 2000
163
+ },
164
+ {
165
+ "epoch": 1.3435700575815739,
166
+ "grad_norm": 6.199832916259766,
167
+ "learning_rate": 1.4628278950735765e-05,
168
+ "loss": 0.3827814102172852,
169
+ "step": 2100
170
+ },
171
+ {
172
+ "epoch": 1.4075495841330774,
173
+ "grad_norm": 17.21854019165039,
174
+ "learning_rate": 1.4372360844529752e-05,
175
+ "loss": 0.41142051696777343,
176
+ "step": 2200
177
+ },
178
+ {
179
+ "epoch": 1.471529110684581,
180
+ "grad_norm": 15.716930389404297,
181
+ "learning_rate": 1.4116442738323737e-05,
182
+ "loss": 0.38613304138183596,
183
+ "step": 2300
184
+ },
185
+ {
186
+ "epoch": 1.5355086372360844,
187
+ "grad_norm": 6.01155948638916,
188
+ "learning_rate": 1.3860524632117725e-05,
189
+ "loss": 0.41208648681640625,
190
+ "step": 2400
191
+ },
192
+ {
193
+ "epoch": 1.599488163787588,
194
+ "grad_norm": 20.55832862854004,
195
+ "learning_rate": 1.360460652591171e-05,
196
+ "loss": 0.38446582794189454,
197
+ "step": 2500
198
+ },
199
+ {
200
+ "epoch": 1.6634676903390915,
201
+ "grad_norm": 11.465950012207031,
202
+ "learning_rate": 1.3348688419705695e-05,
203
+ "loss": 0.37646575927734377,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 1.727447216890595,
208
+ "grad_norm": 3.505220651626587,
209
+ "learning_rate": 1.3092770313499681e-05,
210
+ "loss": 0.3866144561767578,
211
+ "step": 2700
212
+ },
213
+ {
214
+ "epoch": 1.7914267434420985,
215
+ "grad_norm": 27.13107681274414,
216
+ "learning_rate": 1.2836852207293666e-05,
217
+ "loss": 0.4025085830688477,
218
+ "step": 2800
219
+ },
220
+ {
221
+ "epoch": 1.855406269993602,
222
+ "grad_norm": 7.345473766326904,
223
+ "learning_rate": 1.2580934101087654e-05,
224
+ "loss": 0.3890159225463867,
225
+ "step": 2900
226
+ },
227
+ {
228
+ "epoch": 1.9193857965451055,
229
+ "grad_norm": 24.46322250366211,
230
+ "learning_rate": 1.232501599488164e-05,
231
+ "loss": 0.381514778137207,
232
+ "step": 3000
233
+ },
234
+ {
235
+ "epoch": 1.983365323096609,
236
+ "grad_norm": 6.782553195953369,
237
+ "learning_rate": 1.2069097888675624e-05,
238
+ "loss": 0.3956758499145508,
239
+ "step": 3100
240
+ },
241
+ {
242
+ "epoch": 2.0,
243
+ "eval_accuracy": 0.8286,
244
+ "eval_f1": 0.8181779607077694,
245
+ "eval_loss": 0.3925381898880005,
246
+ "eval_precision": 0.8711484593837535,
247
+ "eval_recall": 0.77128,
248
+ "eval_runtime": 18.1817,
249
+ "eval_samples_per_second": 1375.01,
250
+ "eval_steps_per_second": 85.966,
251
+ "step": 3126
252
+ },
253
+ {
254
+ "epoch": 2.0473448496481126,
255
+ "grad_norm": 11.411704063415527,
256
+ "learning_rate": 1.181317978246961e-05,
257
+ "loss": 0.36985355377197265,
258
+ "step": 3200
259
+ },
260
+ {
261
+ "epoch": 2.111324376199616,
262
+ "grad_norm": 8.180529594421387,
263
+ "learning_rate": 1.1557261676263596e-05,
264
+ "loss": 0.34660816192626953,
265
+ "step": 3300
266
+ },
267
+ {
268
+ "epoch": 2.1753039027511196,
269
+ "grad_norm": 14.788076400756836,
270
+ "learning_rate": 1.1301343570057584e-05,
271
+ "loss": 0.3805205154418945,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 2.239283429302623,
276
+ "grad_norm": 38.90132522583008,
277
+ "learning_rate": 1.1045425463851569e-05,
278
+ "loss": 0.36612781524658206,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 2.3032629558541267,
283
+ "grad_norm": 6.464240550994873,
284
+ "learning_rate": 1.0789507357645555e-05,
285
+ "loss": 0.3882516098022461,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 2.36724248240563,
290
+ "grad_norm": 12.745105743408203,
291
+ "learning_rate": 1.053358925143954e-05,
292
+ "loss": 0.37442840576171876,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 2.4312220089571337,
297
+ "grad_norm": 18.671857833862305,
298
+ "learning_rate": 1.0277671145233525e-05,
299
+ "loss": 0.3458353424072266,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 2.495201535508637,
304
+ "grad_norm": 14.760592460632324,
305
+ "learning_rate": 1.0021753039027513e-05,
306
+ "loss": 0.3635056686401367,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 2.5591810620601407,
311
+ "grad_norm": 3.513972282409668,
312
+ "learning_rate": 9.765834932821498e-06,
313
+ "loss": 0.33845436096191406,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 2.6231605886116443,
318
+ "grad_norm": 7.455184459686279,
319
+ "learning_rate": 9.509916826615485e-06,
320
+ "loss": 0.36538402557373045,
321
+ "step": 4100
322
+ },
323
+ {
324
+ "epoch": 2.6871401151631478,
325
+ "grad_norm": 35.58430862426758,
326
+ "learning_rate": 9.25399872040947e-06,
327
+ "loss": 0.38003883361816404,
328
+ "step": 4200
329
+ },
330
+ {
331
+ "epoch": 2.7511196417146513,
332
+ "grad_norm": 28.353757858276367,
333
+ "learning_rate": 8.998080614203456e-06,
334
+ "loss": 0.334156379699707,
335
+ "step": 4300
336
+ },
337
+ {
338
+ "epoch": 2.815099168266155,
339
+ "grad_norm": 10.301733016967773,
340
+ "learning_rate": 8.742162507997441e-06,
341
+ "loss": 0.36084671020507814,
342
+ "step": 4400
343
+ },
344
+ {
345
+ "epoch": 2.8790786948176583,
346
+ "grad_norm": 8.111748695373535,
347
+ "learning_rate": 8.486244401791428e-06,
348
+ "loss": 0.3733618545532227,
349
+ "step": 4500
350
+ },
351
+ {
352
+ "epoch": 2.943058221369162,
353
+ "grad_norm": 7.716980934143066,
354
+ "learning_rate": 8.230326295585414e-06,
355
+ "loss": 0.37022560119628906,
356
+ "step": 4600
357
+ },
358
+ {
359
+ "epoch": 3.0,
360
+ "eval_accuracy": 0.8426,
361
+ "eval_f1": 0.8457286235151135,
362
+ "eval_loss": 0.36446496844291687,
363
+ "eval_precision": 0.8292457907280695,
364
+ "eval_recall": 0.86288,
365
+ "eval_runtime": 13.8616,
366
+ "eval_samples_per_second": 1803.544,
367
+ "eval_steps_per_second": 112.758,
368
+ "step": 4689
369
+ }
370
+ ],
371
+ "logging_steps": 100,
372
+ "max_steps": 7815,
373
+ "num_input_tokens_seen": 0,
374
+ "num_train_epochs": 5,
375
+ "save_steps": 500,
376
+ "stateful_callbacks": {
377
+ "TrainerControl": {
378
+ "args": {
379
+ "should_epoch_stop": false,
380
+ "should_evaluate": false,
381
+ "should_log": false,
382
+ "should_save": true,
383
+ "should_training_stop": false
384
+ },
385
+ "attributes": {}
386
+ }
387
+ },
388
+ "total_flos": 47643264000000.0,
389
+ "train_batch_size": 16,
390
+ "trial_name": null,
391
+ "trial_params": null
392
+ }
checkpoint-4689/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72e5730834fb50b731e6ed2a038bc96471cea902a782210769d571be5fa37df0
3
+ size 5201
checkpoint-6252/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 128,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 512,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 512,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 2,
21
+ "num_hidden_layers": 2,
22
+ "pad_token_id": 0,
23
+ "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
+ "type_vocab_size": 2,
27
+ "use_cache": false,
28
+ "vocab_size": 30522
29
+ }
checkpoint-6252/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edcfd43658753d747967f6c65bb2850d5c3ac0e8d3d1fde9fb15be1c25f31379
3
+ size 17549304
checkpoint-6252/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71fe50cbf9b4fd235d2a2a9d5c59532548cee90c96ac9a389f6e945c90b8cbed
3
+ size 35124939
checkpoint-6252/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add48656b491a29e576e41e18eeb0d936a6690b0fe71b8e7b8fb1862a5c4edfa
3
+ size 14645
checkpoint-6252/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:632775edc5ea848beee3dfdd8bdea1dfbcfc0f819e9209a3f6b1a4c76c74812a
3
+ size 1465
checkpoint-6252/trainer_state.json ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 6252,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06397952655150352,
14
+ "grad_norm": 2.6240322589874268,
15
+ "learning_rate": 1.974664107485605e-05,
16
+ "loss": 0.692889175415039,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.12795905310300704,
21
+ "grad_norm": 1.6500952243804932,
22
+ "learning_rate": 1.9490722968650032e-05,
23
+ "loss": 0.6690621948242188,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.19193857965451055,
28
+ "grad_norm": 5.539139747619629,
29
+ "learning_rate": 1.923480486244402e-05,
30
+ "loss": 0.6443107604980469,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.2559181062060141,
35
+ "grad_norm": 9.561639785766602,
36
+ "learning_rate": 1.8978886756238006e-05,
37
+ "loss": 0.6174827194213868,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.3198976327575176,
42
+ "grad_norm": 3.9405109882354736,
43
+ "learning_rate": 1.8722968650031992e-05,
44
+ "loss": 0.603485221862793,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.3838771593090211,
49
+ "grad_norm": 5.654637336730957,
50
+ "learning_rate": 1.846705054382598e-05,
51
+ "loss": 0.5690624237060546,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.44785668586052463,
56
+ "grad_norm": 6.288730144500732,
57
+ "learning_rate": 1.8211132437619962e-05,
58
+ "loss": 0.5585448455810547,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.5118362124120281,
63
+ "grad_norm": 7.303677558898926,
64
+ "learning_rate": 1.795521433141395e-05,
65
+ "loss": 0.5453743362426757,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.5758157389635317,
70
+ "grad_norm": 4.167948246002197,
71
+ "learning_rate": 1.7699296225207935e-05,
72
+ "loss": 0.5284878540039063,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.6397952655150352,
77
+ "grad_norm": 6.335079669952393,
78
+ "learning_rate": 1.744337811900192e-05,
79
+ "loss": 0.497203369140625,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.7037747920665387,
84
+ "grad_norm": 9.398835182189941,
85
+ "learning_rate": 1.7187460012795908e-05,
86
+ "loss": 0.4662479782104492,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.7677543186180422,
91
+ "grad_norm": 17.086259841918945,
92
+ "learning_rate": 1.693154190658989e-05,
93
+ "loss": 0.4858377456665039,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.8317338451695457,
98
+ "grad_norm": 11.53818416595459,
99
+ "learning_rate": 1.6675623800383878e-05,
100
+ "loss": 0.45729072570800783,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.8957133717210493,
105
+ "grad_norm": 22.93279457092285,
106
+ "learning_rate": 1.6419705694177864e-05,
107
+ "loss": 0.45677249908447265,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.9596928982725528,
112
+ "grad_norm": 15.763897895812988,
113
+ "learning_rate": 1.616378758797185e-05,
114
+ "loss": 0.4614885711669922,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 1.0,
119
+ "eval_accuracy": 0.81124,
120
+ "eval_f1": 0.8026596411993476,
121
+ "eval_loss": 0.4272981286048889,
122
+ "eval_precision": 0.8408832033645842,
123
+ "eval_recall": 0.76776,
124
+ "eval_runtime": 16.5749,
125
+ "eval_samples_per_second": 1508.308,
126
+ "eval_steps_per_second": 94.299,
127
+ "step": 1563
128
+ },
129
+ {
130
+ "epoch": 1.0236724248240563,
131
+ "grad_norm": 16.664216995239258,
132
+ "learning_rate": 1.5907869481765838e-05,
133
+ "loss": 0.44625389099121093,
134
+ "step": 1600
135
+ },
136
+ {
137
+ "epoch": 1.0876519513755598,
138
+ "grad_norm": 23.07805824279785,
139
+ "learning_rate": 1.565195137555982e-05,
140
+ "loss": 0.44664024353027343,
141
+ "step": 1700
142
+ },
143
+ {
144
+ "epoch": 1.1516314779270633,
145
+ "grad_norm": 8.32645320892334,
146
+ "learning_rate": 1.5396033269353807e-05,
147
+ "loss": 0.42939315795898436,
148
+ "step": 1800
149
+ },
150
+ {
151
+ "epoch": 1.2156110044785668,
152
+ "grad_norm": 7.903297424316406,
153
+ "learning_rate": 1.5140115163147796e-05,
154
+ "loss": 0.41601951599121095,
155
+ "step": 1900
156
+ },
157
+ {
158
+ "epoch": 1.2795905310300704,
159
+ "grad_norm": 11.639277458190918,
160
+ "learning_rate": 1.488419705694178e-05,
161
+ "loss": 0.39629173278808594,
162
+ "step": 2000
163
+ },
164
+ {
165
+ "epoch": 1.3435700575815739,
166
+ "grad_norm": 6.199832916259766,
167
+ "learning_rate": 1.4628278950735765e-05,
168
+ "loss": 0.3827814102172852,
169
+ "step": 2100
170
+ },
171
+ {
172
+ "epoch": 1.4075495841330774,
173
+ "grad_norm": 17.21854019165039,
174
+ "learning_rate": 1.4372360844529752e-05,
175
+ "loss": 0.41142051696777343,
176
+ "step": 2200
177
+ },
178
+ {
179
+ "epoch": 1.471529110684581,
180
+ "grad_norm": 15.716930389404297,
181
+ "learning_rate": 1.4116442738323737e-05,
182
+ "loss": 0.38613304138183596,
183
+ "step": 2300
184
+ },
185
+ {
186
+ "epoch": 1.5355086372360844,
187
+ "grad_norm": 6.01155948638916,
188
+ "learning_rate": 1.3860524632117725e-05,
189
+ "loss": 0.41208648681640625,
190
+ "step": 2400
191
+ },
192
+ {
193
+ "epoch": 1.599488163787588,
194
+ "grad_norm": 20.55832862854004,
195
+ "learning_rate": 1.360460652591171e-05,
196
+ "loss": 0.38446582794189454,
197
+ "step": 2500
198
+ },
199
+ {
200
+ "epoch": 1.6634676903390915,
201
+ "grad_norm": 11.465950012207031,
202
+ "learning_rate": 1.3348688419705695e-05,
203
+ "loss": 0.37646575927734377,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 1.727447216890595,
208
+ "grad_norm": 3.505220651626587,
209
+ "learning_rate": 1.3092770313499681e-05,
210
+ "loss": 0.3866144561767578,
211
+ "step": 2700
212
+ },
213
+ {
214
+ "epoch": 1.7914267434420985,
215
+ "grad_norm": 27.13107681274414,
216
+ "learning_rate": 1.2836852207293666e-05,
217
+ "loss": 0.4025085830688477,
218
+ "step": 2800
219
+ },
220
+ {
221
+ "epoch": 1.855406269993602,
222
+ "grad_norm": 7.345473766326904,
223
+ "learning_rate": 1.2580934101087654e-05,
224
+ "loss": 0.3890159225463867,
225
+ "step": 2900
226
+ },
227
+ {
228
+ "epoch": 1.9193857965451055,
229
+ "grad_norm": 24.46322250366211,
230
+ "learning_rate": 1.232501599488164e-05,
231
+ "loss": 0.381514778137207,
232
+ "step": 3000
233
+ },
234
+ {
235
+ "epoch": 1.983365323096609,
236
+ "grad_norm": 6.782553195953369,
237
+ "learning_rate": 1.2069097888675624e-05,
238
+ "loss": 0.3956758499145508,
239
+ "step": 3100
240
+ },
241
+ {
242
+ "epoch": 2.0,
243
+ "eval_accuracy": 0.8286,
244
+ "eval_f1": 0.8181779607077694,
245
+ "eval_loss": 0.3925381898880005,
246
+ "eval_precision": 0.8711484593837535,
247
+ "eval_recall": 0.77128,
248
+ "eval_runtime": 18.1817,
249
+ "eval_samples_per_second": 1375.01,
250
+ "eval_steps_per_second": 85.966,
251
+ "step": 3126
252
+ },
253
+ {
254
+ "epoch": 2.0473448496481126,
255
+ "grad_norm": 11.411704063415527,
256
+ "learning_rate": 1.181317978246961e-05,
257
+ "loss": 0.36985355377197265,
258
+ "step": 3200
259
+ },
260
+ {
261
+ "epoch": 2.111324376199616,
262
+ "grad_norm": 8.180529594421387,
263
+ "learning_rate": 1.1557261676263596e-05,
264
+ "loss": 0.34660816192626953,
265
+ "step": 3300
266
+ },
267
+ {
268
+ "epoch": 2.1753039027511196,
269
+ "grad_norm": 14.788076400756836,
270
+ "learning_rate": 1.1301343570057584e-05,
271
+ "loss": 0.3805205154418945,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 2.239283429302623,
276
+ "grad_norm": 38.90132522583008,
277
+ "learning_rate": 1.1045425463851569e-05,
278
+ "loss": 0.36612781524658206,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 2.3032629558541267,
283
+ "grad_norm": 6.464240550994873,
284
+ "learning_rate": 1.0789507357645555e-05,
285
+ "loss": 0.3882516098022461,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 2.36724248240563,
290
+ "grad_norm": 12.745105743408203,
291
+ "learning_rate": 1.053358925143954e-05,
292
+ "loss": 0.37442840576171876,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 2.4312220089571337,
297
+ "grad_norm": 18.671857833862305,
298
+ "learning_rate": 1.0277671145233525e-05,
299
+ "loss": 0.3458353424072266,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 2.495201535508637,
304
+ "grad_norm": 14.760592460632324,
305
+ "learning_rate": 1.0021753039027513e-05,
306
+ "loss": 0.3635056686401367,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 2.5591810620601407,
311
+ "grad_norm": 3.513972282409668,
312
+ "learning_rate": 9.765834932821498e-06,
313
+ "loss": 0.33845436096191406,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 2.6231605886116443,
318
+ "grad_norm": 7.455184459686279,
319
+ "learning_rate": 9.509916826615485e-06,
320
+ "loss": 0.36538402557373045,
321
+ "step": 4100
322
+ },
323
+ {
324
+ "epoch": 2.6871401151631478,
325
+ "grad_norm": 35.58430862426758,
326
+ "learning_rate": 9.25399872040947e-06,
327
+ "loss": 0.38003883361816404,
328
+ "step": 4200
329
+ },
330
+ {
331
+ "epoch": 2.7511196417146513,
332
+ "grad_norm": 28.353757858276367,
333
+ "learning_rate": 8.998080614203456e-06,
334
+ "loss": 0.334156379699707,
335
+ "step": 4300
336
+ },
337
+ {
338
+ "epoch": 2.815099168266155,
339
+ "grad_norm": 10.301733016967773,
340
+ "learning_rate": 8.742162507997441e-06,
341
+ "loss": 0.36084671020507814,
342
+ "step": 4400
343
+ },
344
+ {
345
+ "epoch": 2.8790786948176583,
346
+ "grad_norm": 8.111748695373535,
347
+ "learning_rate": 8.486244401791428e-06,
348
+ "loss": 0.3733618545532227,
349
+ "step": 4500
350
+ },
351
+ {
352
+ "epoch": 2.943058221369162,
353
+ "grad_norm": 7.716980934143066,
354
+ "learning_rate": 8.230326295585414e-06,
355
+ "loss": 0.37022560119628906,
356
+ "step": 4600
357
+ },
358
+ {
359
+ "epoch": 3.0,
360
+ "eval_accuracy": 0.8426,
361
+ "eval_f1": 0.8457286235151135,
362
+ "eval_loss": 0.36446496844291687,
363
+ "eval_precision": 0.8292457907280695,
364
+ "eval_recall": 0.86288,
365
+ "eval_runtime": 13.8616,
366
+ "eval_samples_per_second": 1803.544,
367
+ "eval_steps_per_second": 112.758,
368
+ "step": 4689
369
+ },
370
+ {
371
+ "epoch": 3.0070377479206654,
372
+ "grad_norm": 8.766412734985352,
373
+ "learning_rate": 7.974408189379399e-06,
374
+ "loss": 0.3578424835205078,
375
+ "step": 4700
376
+ },
377
+ {
378
+ "epoch": 3.071017274472169,
379
+ "grad_norm": 25.10895538330078,
380
+ "learning_rate": 7.718490083173386e-06,
381
+ "loss": 0.36146461486816406,
382
+ "step": 4800
383
+ },
384
+ {
385
+ "epoch": 3.1349968010236724,
386
+ "grad_norm": 2.3490066528320312,
387
+ "learning_rate": 7.462571976967371e-06,
388
+ "loss": 0.33738922119140624,
389
+ "step": 4900
390
+ },
391
+ {
392
+ "epoch": 3.198976327575176,
393
+ "grad_norm": 13.975132942199707,
394
+ "learning_rate": 7.206653870761356e-06,
395
+ "loss": 0.326907958984375,
396
+ "step": 5000
397
+ },
398
+ {
399
+ "epoch": 3.2629558541266794,
400
+ "grad_norm": 6.52653694152832,
401
+ "learning_rate": 6.950735764555343e-06,
402
+ "loss": 0.34798374176025393,
403
+ "step": 5100
404
+ },
405
+ {
406
+ "epoch": 3.326935380678183,
407
+ "grad_norm": 5.691891670227051,
408
+ "learning_rate": 6.694817658349328e-06,
409
+ "loss": 0.36636493682861326,
410
+ "step": 5200
411
+ },
412
+ {
413
+ "epoch": 3.3909149072296865,
414
+ "grad_norm": 25.656173706054688,
415
+ "learning_rate": 6.438899552143315e-06,
416
+ "loss": 0.3263176727294922,
417
+ "step": 5300
418
+ },
419
+ {
420
+ "epoch": 3.45489443378119,
421
+ "grad_norm": 10.740619659423828,
422
+ "learning_rate": 6.182981445937301e-06,
423
+ "loss": 0.36173702239990235,
424
+ "step": 5400
425
+ },
426
+ {
427
+ "epoch": 3.5188739603326935,
428
+ "grad_norm": 6.179443836212158,
429
+ "learning_rate": 5.927063339731286e-06,
430
+ "loss": 0.3383364486694336,
431
+ "step": 5500
432
+ },
433
+ {
434
+ "epoch": 3.582853486884197,
435
+ "grad_norm": 8.635146141052246,
436
+ "learning_rate": 5.671145233525273e-06,
437
+ "loss": 0.3060850715637207,
438
+ "step": 5600
439
+ },
440
+ {
441
+ "epoch": 3.6468330134357005,
442
+ "grad_norm": 12.844294548034668,
443
+ "learning_rate": 5.415227127319258e-06,
444
+ "loss": 0.3492905807495117,
445
+ "step": 5700
446
+ },
447
+ {
448
+ "epoch": 3.710812539987204,
449
+ "grad_norm": 3.7722866535186768,
450
+ "learning_rate": 5.159309021113244e-06,
451
+ "loss": 0.352115592956543,
452
+ "step": 5800
453
+ },
454
+ {
455
+ "epoch": 3.7747920665387076,
456
+ "grad_norm": 16.642221450805664,
457
+ "learning_rate": 4.90339091490723e-06,
458
+ "loss": 0.3466293716430664,
459
+ "step": 5900
460
+ },
461
+ {
462
+ "epoch": 3.838771593090211,
463
+ "grad_norm": 22.054513931274414,
464
+ "learning_rate": 4.647472808701216e-06,
465
+ "loss": 0.3272230529785156,
466
+ "step": 6000
467
+ },
468
+ {
469
+ "epoch": 3.9027511196417146,
470
+ "grad_norm": 5.188161849975586,
471
+ "learning_rate": 4.391554702495202e-06,
472
+ "loss": 0.3530220794677734,
473
+ "step": 6100
474
+ },
475
+ {
476
+ "epoch": 3.966730646193218,
477
+ "grad_norm": 24.137426376342773,
478
+ "learning_rate": 4.135636596289187e-06,
479
+ "loss": 0.3384724807739258,
480
+ "step": 6200
481
+ },
482
+ {
483
+ "epoch": 4.0,
484
+ "eval_accuracy": 0.8466,
485
+ "eval_f1": 0.845081801656231,
486
+ "eval_loss": 0.36203694343566895,
487
+ "eval_precision": 0.8535291717666259,
488
+ "eval_recall": 0.8368,
489
+ "eval_runtime": 16.3968,
490
+ "eval_samples_per_second": 1524.69,
491
+ "eval_steps_per_second": 95.324,
492
+ "step": 6252
493
+ }
494
+ ],
495
+ "logging_steps": 100,
496
+ "max_steps": 7815,
497
+ "num_input_tokens_seen": 0,
498
+ "num_train_epochs": 5,
499
+ "save_steps": 500,
500
+ "stateful_callbacks": {
501
+ "TrainerControl": {
502
+ "args": {
503
+ "should_epoch_stop": false,
504
+ "should_evaluate": false,
505
+ "should_log": false,
506
+ "should_save": true,
507
+ "should_training_stop": false
508
+ },
509
+ "attributes": {}
510
+ }
511
+ },
512
+ "total_flos": 63524352000000.0,
513
+ "train_batch_size": 16,
514
+ "trial_name": null,
515
+ "trial_params": null
516
+ }
checkpoint-6252/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72e5730834fb50b731e6ed2a038bc96471cea902a782210769d571be5fa37df0
3
+ size 5201
checkpoint-7815/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 128,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 512,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 512,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 2,
21
+ "num_hidden_layers": 2,
22
+ "pad_token_id": 0,
23
+ "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
+ "type_vocab_size": 2,
27
+ "use_cache": false,
28
+ "vocab_size": 30522
29
+ }
checkpoint-7815/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e463d4b60aefb340640cb8a4ae32c81c7caded7588ca9419eda6a28f12fd1f8e
3
+ size 17549304
checkpoint-7815/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e124969e832e1c996d63f0e09f1f3d1ca2ca540bf954a759cea3cc5e0964a227
3
+ size 35124939
checkpoint-7815/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb4ae28c66f4c0741b11f2d6f97f2592ad1a57f30642f46ec3548339edc75d38
3
+ size 14645
checkpoint-7815/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87e05219b4abffcf7501b383079e0bc0173a8a1d79fa49c25677a005a669925d
3
+ size 1465
checkpoint-7815/trainer_state.json ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 7815,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06397952655150352,
14
+ "grad_norm": 2.6240322589874268,
15
+ "learning_rate": 1.974664107485605e-05,
16
+ "loss": 0.692889175415039,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.12795905310300704,
21
+ "grad_norm": 1.6500952243804932,
22
+ "learning_rate": 1.9490722968650032e-05,
23
+ "loss": 0.6690621948242188,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.19193857965451055,
28
+ "grad_norm": 5.539139747619629,
29
+ "learning_rate": 1.923480486244402e-05,
30
+ "loss": 0.6443107604980469,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.2559181062060141,
35
+ "grad_norm": 9.561639785766602,
36
+ "learning_rate": 1.8978886756238006e-05,
37
+ "loss": 0.6174827194213868,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.3198976327575176,
42
+ "grad_norm": 3.9405109882354736,
43
+ "learning_rate": 1.8722968650031992e-05,
44
+ "loss": 0.603485221862793,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.3838771593090211,
49
+ "grad_norm": 5.654637336730957,
50
+ "learning_rate": 1.846705054382598e-05,
51
+ "loss": 0.5690624237060546,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.44785668586052463,
56
+ "grad_norm": 6.288730144500732,
57
+ "learning_rate": 1.8211132437619962e-05,
58
+ "loss": 0.5585448455810547,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.5118362124120281,
63
+ "grad_norm": 7.303677558898926,
64
+ "learning_rate": 1.795521433141395e-05,
65
+ "loss": 0.5453743362426757,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.5758157389635317,
70
+ "grad_norm": 4.167948246002197,
71
+ "learning_rate": 1.7699296225207935e-05,
72
+ "loss": 0.5284878540039063,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.6397952655150352,
77
+ "grad_norm": 6.335079669952393,
78
+ "learning_rate": 1.744337811900192e-05,
79
+ "loss": 0.497203369140625,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.7037747920665387,
84
+ "grad_norm": 9.398835182189941,
85
+ "learning_rate": 1.7187460012795908e-05,
86
+ "loss": 0.4662479782104492,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.7677543186180422,
91
+ "grad_norm": 17.086259841918945,
92
+ "learning_rate": 1.693154190658989e-05,
93
+ "loss": 0.4858377456665039,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.8317338451695457,
98
+ "grad_norm": 11.53818416595459,
99
+ "learning_rate": 1.6675623800383878e-05,
100
+ "loss": 0.45729072570800783,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.8957133717210493,
105
+ "grad_norm": 22.93279457092285,
106
+ "learning_rate": 1.6419705694177864e-05,
107
+ "loss": 0.45677249908447265,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.9596928982725528,
112
+ "grad_norm": 15.763897895812988,
113
+ "learning_rate": 1.616378758797185e-05,
114
+ "loss": 0.4614885711669922,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 1.0,
119
+ "eval_accuracy": 0.81124,
120
+ "eval_f1": 0.8026596411993476,
121
+ "eval_loss": 0.4272981286048889,
122
+ "eval_precision": 0.8408832033645842,
123
+ "eval_recall": 0.76776,
124
+ "eval_runtime": 16.5749,
125
+ "eval_samples_per_second": 1508.308,
126
+ "eval_steps_per_second": 94.299,
127
+ "step": 1563
128
+ },
129
+ {
130
+ "epoch": 1.0236724248240563,
131
+ "grad_norm": 16.664216995239258,
132
+ "learning_rate": 1.5907869481765838e-05,
133
+ "loss": 0.44625389099121093,
134
+ "step": 1600
135
+ },
136
+ {
137
+ "epoch": 1.0876519513755598,
138
+ "grad_norm": 23.07805824279785,
139
+ "learning_rate": 1.565195137555982e-05,
140
+ "loss": 0.44664024353027343,
141
+ "step": 1700
142
+ },
143
+ {
144
+ "epoch": 1.1516314779270633,
145
+ "grad_norm": 8.32645320892334,
146
+ "learning_rate": 1.5396033269353807e-05,
147
+ "loss": 0.42939315795898436,
148
+ "step": 1800
149
+ },
150
+ {
151
+ "epoch": 1.2156110044785668,
152
+ "grad_norm": 7.903297424316406,
153
+ "learning_rate": 1.5140115163147796e-05,
154
+ "loss": 0.41601951599121095,
155
+ "step": 1900
156
+ },
157
+ {
158
+ "epoch": 1.2795905310300704,
159
+ "grad_norm": 11.639277458190918,
160
+ "learning_rate": 1.488419705694178e-05,
161
+ "loss": 0.39629173278808594,
162
+ "step": 2000
163
+ },
164
+ {
165
+ "epoch": 1.3435700575815739,
166
+ "grad_norm": 6.199832916259766,
167
+ "learning_rate": 1.4628278950735765e-05,
168
+ "loss": 0.3827814102172852,
169
+ "step": 2100
170
+ },
171
+ {
172
+ "epoch": 1.4075495841330774,
173
+ "grad_norm": 17.21854019165039,
174
+ "learning_rate": 1.4372360844529752e-05,
175
+ "loss": 0.41142051696777343,
176
+ "step": 2200
177
+ },
178
+ {
179
+ "epoch": 1.471529110684581,
180
+ "grad_norm": 15.716930389404297,
181
+ "learning_rate": 1.4116442738323737e-05,
182
+ "loss": 0.38613304138183596,
183
+ "step": 2300
184
+ },
185
+ {
186
+ "epoch": 1.5355086372360844,
187
+ "grad_norm": 6.01155948638916,
188
+ "learning_rate": 1.3860524632117725e-05,
189
+ "loss": 0.41208648681640625,
190
+ "step": 2400
191
+ },
192
+ {
193
+ "epoch": 1.599488163787588,
194
+ "grad_norm": 20.55832862854004,
195
+ "learning_rate": 1.360460652591171e-05,
196
+ "loss": 0.38446582794189454,
197
+ "step": 2500
198
+ },
199
+ {
200
+ "epoch": 1.6634676903390915,
201
+ "grad_norm": 11.465950012207031,
202
+ "learning_rate": 1.3348688419705695e-05,
203
+ "loss": 0.37646575927734377,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 1.727447216890595,
208
+ "grad_norm": 3.505220651626587,
209
+ "learning_rate": 1.3092770313499681e-05,
210
+ "loss": 0.3866144561767578,
211
+ "step": 2700
212
+ },
213
+ {
214
+ "epoch": 1.7914267434420985,
215
+ "grad_norm": 27.13107681274414,
216
+ "learning_rate": 1.2836852207293666e-05,
217
+ "loss": 0.4025085830688477,
218
+ "step": 2800
219
+ },
220
+ {
221
+ "epoch": 1.855406269993602,
222
+ "grad_norm": 7.345473766326904,
223
+ "learning_rate": 1.2580934101087654e-05,
224
+ "loss": 0.3890159225463867,
225
+ "step": 2900
226
+ },
227
+ {
228
+ "epoch": 1.9193857965451055,
229
+ "grad_norm": 24.46322250366211,
230
+ "learning_rate": 1.232501599488164e-05,
231
+ "loss": 0.381514778137207,
232
+ "step": 3000
233
+ },
234
+ {
235
+ "epoch": 1.983365323096609,
236
+ "grad_norm": 6.782553195953369,
237
+ "learning_rate": 1.2069097888675624e-05,
238
+ "loss": 0.3956758499145508,
239
+ "step": 3100
240
+ },
241
+ {
242
+ "epoch": 2.0,
243
+ "eval_accuracy": 0.8286,
244
+ "eval_f1": 0.8181779607077694,
245
+ "eval_loss": 0.3925381898880005,
246
+ "eval_precision": 0.8711484593837535,
247
+ "eval_recall": 0.77128,
248
+ "eval_runtime": 18.1817,
249
+ "eval_samples_per_second": 1375.01,
250
+ "eval_steps_per_second": 85.966,
251
+ "step": 3126
252
+ },
253
+ {
254
+ "epoch": 2.0473448496481126,
255
+ "grad_norm": 11.411704063415527,
256
+ "learning_rate": 1.181317978246961e-05,
257
+ "loss": 0.36985355377197265,
258
+ "step": 3200
259
+ },
260
+ {
261
+ "epoch": 2.111324376199616,
262
+ "grad_norm": 8.180529594421387,
263
+ "learning_rate": 1.1557261676263596e-05,
264
+ "loss": 0.34660816192626953,
265
+ "step": 3300
266
+ },
267
+ {
268
+ "epoch": 2.1753039027511196,
269
+ "grad_norm": 14.788076400756836,
270
+ "learning_rate": 1.1301343570057584e-05,
271
+ "loss": 0.3805205154418945,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 2.239283429302623,
276
+ "grad_norm": 38.90132522583008,
277
+ "learning_rate": 1.1045425463851569e-05,
278
+ "loss": 0.36612781524658206,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 2.3032629558541267,
283
+ "grad_norm": 6.464240550994873,
284
+ "learning_rate": 1.0789507357645555e-05,
285
+ "loss": 0.3882516098022461,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 2.36724248240563,
290
+ "grad_norm": 12.745105743408203,
291
+ "learning_rate": 1.053358925143954e-05,
292
+ "loss": 0.37442840576171876,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 2.4312220089571337,
297
+ "grad_norm": 18.671857833862305,
298
+ "learning_rate": 1.0277671145233525e-05,
299
+ "loss": 0.3458353424072266,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 2.495201535508637,
304
+ "grad_norm": 14.760592460632324,
305
+ "learning_rate": 1.0021753039027513e-05,
306
+ "loss": 0.3635056686401367,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 2.5591810620601407,
311
+ "grad_norm": 3.513972282409668,
312
+ "learning_rate": 9.765834932821498e-06,
313
+ "loss": 0.33845436096191406,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 2.6231605886116443,
318
+ "grad_norm": 7.455184459686279,
319
+ "learning_rate": 9.509916826615485e-06,
320
+ "loss": 0.36538402557373045,
321
+ "step": 4100
322
+ },
323
+ {
324
+ "epoch": 2.6871401151631478,
325
+ "grad_norm": 35.58430862426758,
326
+ "learning_rate": 9.25399872040947e-06,
327
+ "loss": 0.38003883361816404,
328
+ "step": 4200
329
+ },
330
+ {
331
+ "epoch": 2.7511196417146513,
332
+ "grad_norm": 28.353757858276367,
333
+ "learning_rate": 8.998080614203456e-06,
334
+ "loss": 0.334156379699707,
335
+ "step": 4300
336
+ },
337
+ {
338
+ "epoch": 2.815099168266155,
339
+ "grad_norm": 10.301733016967773,
340
+ "learning_rate": 8.742162507997441e-06,
341
+ "loss": 0.36084671020507814,
342
+ "step": 4400
343
+ },
344
+ {
345
+ "epoch": 2.8790786948176583,
346
+ "grad_norm": 8.111748695373535,
347
+ "learning_rate": 8.486244401791428e-06,
348
+ "loss": 0.3733618545532227,
349
+ "step": 4500
350
+ },
351
+ {
352
+ "epoch": 2.943058221369162,
353
+ "grad_norm": 7.716980934143066,
354
+ "learning_rate": 8.230326295585414e-06,
355
+ "loss": 0.37022560119628906,
356
+ "step": 4600
357
+ },
358
+ {
359
+ "epoch": 3.0,
360
+ "eval_accuracy": 0.8426,
361
+ "eval_f1": 0.8457286235151135,
362
+ "eval_loss": 0.36446496844291687,
363
+ "eval_precision": 0.8292457907280695,
364
+ "eval_recall": 0.86288,
365
+ "eval_runtime": 13.8616,
366
+ "eval_samples_per_second": 1803.544,
367
+ "eval_steps_per_second": 112.758,
368
+ "step": 4689
369
+ },
370
+ {
371
+ "epoch": 3.0070377479206654,
372
+ "grad_norm": 8.766412734985352,
373
+ "learning_rate": 7.974408189379399e-06,
374
+ "loss": 0.3578424835205078,
375
+ "step": 4700
376
+ },
377
+ {
378
+ "epoch": 3.071017274472169,
379
+ "grad_norm": 25.10895538330078,
380
+ "learning_rate": 7.718490083173386e-06,
381
+ "loss": 0.36146461486816406,
382
+ "step": 4800
383
+ },
384
+ {
385
+ "epoch": 3.1349968010236724,
386
+ "grad_norm": 2.3490066528320312,
387
+ "learning_rate": 7.462571976967371e-06,
388
+ "loss": 0.33738922119140624,
389
+ "step": 4900
390
+ },
391
+ {
392
+ "epoch": 3.198976327575176,
393
+ "grad_norm": 13.975132942199707,
394
+ "learning_rate": 7.206653870761356e-06,
395
+ "loss": 0.326907958984375,
396
+ "step": 5000
397
+ },
398
+ {
399
+ "epoch": 3.2629558541266794,
400
+ "grad_norm": 6.52653694152832,
401
+ "learning_rate": 6.950735764555343e-06,
402
+ "loss": 0.34798374176025393,
403
+ "step": 5100
404
+ },
405
+ {
406
+ "epoch": 3.326935380678183,
407
+ "grad_norm": 5.691891670227051,
408
+ "learning_rate": 6.694817658349328e-06,
409
+ "loss": 0.36636493682861326,
410
+ "step": 5200
411
+ },
412
+ {
413
+ "epoch": 3.3909149072296865,
414
+ "grad_norm": 25.656173706054688,
415
+ "learning_rate": 6.438899552143315e-06,
416
+ "loss": 0.3263176727294922,
417
+ "step": 5300
418
+ },
419
+ {
420
+ "epoch": 3.45489443378119,
421
+ "grad_norm": 10.740619659423828,
422
+ "learning_rate": 6.182981445937301e-06,
423
+ "loss": 0.36173702239990235,
424
+ "step": 5400
425
+ },
426
+ {
427
+ "epoch": 3.5188739603326935,
428
+ "grad_norm": 6.179443836212158,
429
+ "learning_rate": 5.927063339731286e-06,
430
+ "loss": 0.3383364486694336,
431
+ "step": 5500
432
+ },
433
+ {
434
+ "epoch": 3.582853486884197,
435
+ "grad_norm": 8.635146141052246,
436
+ "learning_rate": 5.671145233525273e-06,
437
+ "loss": 0.3060850715637207,
438
+ "step": 5600
439
+ },
440
+ {
441
+ "epoch": 3.6468330134357005,
442
+ "grad_norm": 12.844294548034668,
443
+ "learning_rate": 5.415227127319258e-06,
444
+ "loss": 0.3492905807495117,
445
+ "step": 5700
446
+ },
447
+ {
448
+ "epoch": 3.710812539987204,
449
+ "grad_norm": 3.7722866535186768,
450
+ "learning_rate": 5.159309021113244e-06,
451
+ "loss": 0.352115592956543,
452
+ "step": 5800
453
+ },
454
+ {
455
+ "epoch": 3.7747920665387076,
456
+ "grad_norm": 16.642221450805664,
457
+ "learning_rate": 4.90339091490723e-06,
458
+ "loss": 0.3466293716430664,
459
+ "step": 5900
460
+ },
461
+ {
462
+ "epoch": 3.838771593090211,
463
+ "grad_norm": 22.054513931274414,
464
+ "learning_rate": 4.647472808701216e-06,
465
+ "loss": 0.3272230529785156,
466
+ "step": 6000
467
+ },
468
+ {
469
+ "epoch": 3.9027511196417146,
470
+ "grad_norm": 5.188161849975586,
471
+ "learning_rate": 4.391554702495202e-06,
472
+ "loss": 0.3530220794677734,
473
+ "step": 6100
474
+ },
475
+ {
476
+ "epoch": 3.966730646193218,
477
+ "grad_norm": 24.137426376342773,
478
+ "learning_rate": 4.135636596289187e-06,
479
+ "loss": 0.3384724807739258,
480
+ "step": 6200
481
+ },
482
+ {
483
+ "epoch": 4.0,
484
+ "eval_accuracy": 0.8466,
485
+ "eval_f1": 0.845081801656231,
486
+ "eval_loss": 0.36203694343566895,
487
+ "eval_precision": 0.8535291717666259,
488
+ "eval_recall": 0.8368,
489
+ "eval_runtime": 16.3968,
490
+ "eval_samples_per_second": 1524.69,
491
+ "eval_steps_per_second": 95.324,
492
+ "step": 6252
493
+ },
494
+ {
495
+ "epoch": 4.030710172744722,
496
+ "grad_norm": 22.76490592956543,
497
+ "learning_rate": 3.879718490083174e-06,
498
+ "loss": 0.32106273651123046,
499
+ "step": 6300
500
+ },
501
+ {
502
+ "epoch": 4.094689699296225,
503
+ "grad_norm": 6.895203113555908,
504
+ "learning_rate": 3.6238003838771595e-06,
505
+ "loss": 0.32752437591552735,
506
+ "step": 6400
507
+ },
508
+ {
509
+ "epoch": 4.158669225847729,
510
+ "grad_norm": 22.1447696685791,
511
+ "learning_rate": 3.3678822776711457e-06,
512
+ "loss": 0.32436195373535154,
513
+ "step": 6500
514
+ },
515
+ {
516
+ "epoch": 4.222648752399232,
517
+ "grad_norm": 2.5634121894836426,
518
+ "learning_rate": 3.111964171465132e-06,
519
+ "loss": 0.3267522811889648,
520
+ "step": 6600
521
+ },
522
+ {
523
+ "epoch": 4.286628278950736,
524
+ "grad_norm": 11.268364906311035,
525
+ "learning_rate": 2.856046065259117e-06,
526
+ "loss": 0.3346476364135742,
527
+ "step": 6700
528
+ },
529
+ {
530
+ "epoch": 4.350607805502239,
531
+ "grad_norm": 13.30309009552002,
532
+ "learning_rate": 2.6001279590531032e-06,
533
+ "loss": 0.3301812744140625,
534
+ "step": 6800
535
+ },
536
+ {
537
+ "epoch": 4.414587332053743,
538
+ "grad_norm": 36.629371643066406,
539
+ "learning_rate": 2.344209852847089e-06,
540
+ "loss": 0.3226265335083008,
541
+ "step": 6900
542
+ },
543
+ {
544
+ "epoch": 4.478566858605246,
545
+ "grad_norm": 21.645925521850586,
546
+ "learning_rate": 2.088291746641075e-06,
547
+ "loss": 0.3106839370727539,
548
+ "step": 7000
549
+ },
550
+ {
551
+ "epoch": 4.54254638515675,
552
+ "grad_norm": 15.254359245300293,
553
+ "learning_rate": 1.8323736404350608e-06,
554
+ "loss": 0.3521112442016602,
555
+ "step": 7100
556
+ },
557
+ {
558
+ "epoch": 4.606525911708253,
559
+ "grad_norm": 12.946036338806152,
560
+ "learning_rate": 1.576455534229047e-06,
561
+ "loss": 0.314422664642334,
562
+ "step": 7200
563
+ },
564
+ {
565
+ "epoch": 4.670505438259757,
566
+ "grad_norm": 9.554559707641602,
567
+ "learning_rate": 1.3205374280230327e-06,
568
+ "loss": 0.36591068267822263,
569
+ "step": 7300
570
+ },
571
+ {
572
+ "epoch": 4.73448496481126,
573
+ "grad_norm": 16.924890518188477,
574
+ "learning_rate": 1.0646193218170186e-06,
575
+ "loss": 0.3219599151611328,
576
+ "step": 7400
577
+ },
578
+ {
579
+ "epoch": 4.798464491362764,
580
+ "grad_norm": 12.717796325683594,
581
+ "learning_rate": 8.087012156110045e-07,
582
+ "loss": 0.3387944412231445,
583
+ "step": 7500
584
+ },
585
+ {
586
+ "epoch": 4.862444017914267,
587
+ "grad_norm": 17.223955154418945,
588
+ "learning_rate": 5.527831094049904e-07,
589
+ "loss": 0.31999959945678713,
590
+ "step": 7600
591
+ },
592
+ {
593
+ "epoch": 4.926423544465771,
594
+ "grad_norm": 8.725737571716309,
595
+ "learning_rate": 2.9686500319897637e-07,
596
+ "loss": 0.328571662902832,
597
+ "step": 7700
598
+ },
599
+ {
600
+ "epoch": 4.990403071017274,
601
+ "grad_norm": 47.724571228027344,
602
+ "learning_rate": 4.0946896992962254e-08,
603
+ "loss": 0.32502983093261717,
604
+ "step": 7800
605
+ },
606
+ {
607
+ "epoch": 5.0,
608
+ "eval_accuracy": 0.84804,
609
+ "eval_f1": 0.8465856317893632,
610
+ "eval_loss": 0.36020222306251526,
611
+ "eval_precision": 0.8547663703824513,
612
+ "eval_recall": 0.83856,
613
+ "eval_runtime": 20.3823,
614
+ "eval_samples_per_second": 1226.555,
615
+ "eval_steps_per_second": 76.684,
616
+ "step": 7815
617
+ }
618
+ ],
619
+ "logging_steps": 100,
620
+ "max_steps": 7815,
621
+ "num_input_tokens_seen": 0,
622
+ "num_train_epochs": 5,
623
+ "save_steps": 500,
624
+ "stateful_callbacks": {
625
+ "TrainerControl": {
626
+ "args": {
627
+ "should_epoch_stop": false,
628
+ "should_evaluate": false,
629
+ "should_log": false,
630
+ "should_save": true,
631
+ "should_training_stop": true
632
+ },
633
+ "attributes": {}
634
+ }
635
+ },
636
+ "total_flos": 79405440000000.0,
637
+ "train_batch_size": 16,
638
+ "trial_name": null,
639
+ "trial_params": null
640
+ }
checkpoint-7815/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72e5730834fb50b731e6ed2a038bc96471cea902a782210769d571be5fa37df0
3
+ size 5201
config.json CHANGED
@@ -1,26 +1,29 @@
1
  {
2
- "_name_or_path": "prajjwal1/bert-tiny",
3
  "architectures": [
4
  "BertForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
 
7
  "classifier_dropout": null,
 
 
8
  "hidden_act": "gelu",
9
  "hidden_dropout_prob": 0.1,
10
  "hidden_size": 128,
11
  "initializer_range": 0.02,
12
  "intermediate_size": 512,
 
13
  "layer_norm_eps": 1e-12,
14
  "max_position_embeddings": 512,
15
  "model_type": "bert",
16
  "num_attention_heads": 2,
17
  "num_hidden_layers": 2,
18
  "pad_token_id": 0,
19
- "position_embedding_type": "absolute",
20
  "problem_type": "single_label_classification",
21
- "torch_dtype": "float32",
22
- "transformers_version": "4.38.2",
23
  "type_vocab_size": 2,
24
- "use_cache": true,
25
  "vocab_size": 30522
26
  }
 
1
  {
2
+ "add_cross_attention": false,
3
  "architectures": [
4
  "BertForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
  "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
  "hidden_act": "gelu",
12
  "hidden_dropout_prob": 0.1,
13
  "hidden_size": 128,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 512,
16
+ "is_decoder": false,
17
  "layer_norm_eps": 1e-12,
18
  "max_position_embeddings": 512,
19
  "model_type": "bert",
20
  "num_attention_heads": 2,
21
  "num_hidden_layers": 2,
22
  "pad_token_id": 0,
 
23
  "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
  "type_vocab_size": 2,
27
+ "use_cache": false,
28
  "vocab_size": 30522
29
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e02859c5dc120e0414be1b029ae6e54b3674c2b262d8134f14d267e50807501
3
- size 17549312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e463d4b60aefb340640cb8a4ae32c81c7caded7588ca9419eda6a28f12fd1f8e
3
+ size 17549304
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,53 +1,10 @@
1
  {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "100": {
12
- "content": "[UNK]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "101": {
20
- "content": "[CLS]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "102": {
28
- "content": "[SEP]",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "103": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "clean_up_tokenization_spaces": true,
45
  "cls_token": "[CLS]",
46
- "do_basic_tokenize": true,
47
- "do_lower_case": true,
48
  "mask_token": "[MASK]",
49
  "model_max_length": 1000000000000000019884624838656,
50
- "never_split": null,
51
  "pad_token": "[PAD]",
52
  "sep_token": "[SEP]",
53
  "strip_accents": null,
 
1
  {
2
+ "backend": "tokenizers",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "is_local": false,
6
  "mask_token": "[MASK]",
7
  "model_max_length": 1000000000000000019884624838656,
 
8
  "pad_token": "[PAD]",
9
  "sep_token": "[SEP]",
10
  "strip_accents": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4695fa03477caa6949d67ae3a0f17b42d6e931eb0f27a0e1ba906da857df1c2
3
- size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72e5730834fb50b731e6ed2a038bc96471cea902a782210769d571be5fa37df0
3
+ size 5201