leomaurodesenv committed on
Commit
f77d768
verified
1 Parent(s): 7482f7f

feat: add tokenizer

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. checkpoint-1000/config.json +27 -0
  2. checkpoint-1000/model.safetensors +3 -0
  3. checkpoint-1000/optimizer.pt +3 -0
  4. checkpoint-1000/rng_state.pth +3 -0
  5. checkpoint-1000/scaler.pt +3 -0
  6. checkpoint-1000/scheduler.pt +3 -0
  7. checkpoint-1000/special_tokens_map.json +37 -0
  8. checkpoint-1000/tokenizer.json +0 -0
  9. checkpoint-1000/tokenizer_config.json +56 -0
  10. checkpoint-1000/trainer_state.json +363 -0
  11. checkpoint-1000/training_args.bin +3 -0
  12. checkpoint-1000/vocab.txt +0 -0
  13. checkpoint-1500/config.json +27 -0
  14. checkpoint-1500/model.safetensors +3 -0
  15. checkpoint-1500/optimizer.pt +3 -0
  16. checkpoint-1500/rng_state.pth +3 -0
  17. checkpoint-1500/scaler.pt +3 -0
  18. checkpoint-1500/scheduler.pt +3 -0
  19. checkpoint-1500/special_tokens_map.json +37 -0
  20. checkpoint-1500/tokenizer.json +0 -0
  21. checkpoint-1500/tokenizer_config.json +56 -0
  22. checkpoint-1500/trainer_state.json +523 -0
  23. checkpoint-1500/training_args.bin +3 -0
  24. checkpoint-1500/vocab.txt +0 -0
  25. checkpoint-2000/config.json +27 -0
  26. checkpoint-2000/model.safetensors +3 -0
  27. checkpoint-2000/optimizer.pt +3 -0
  28. checkpoint-2000/rng_state.pth +3 -0
  29. checkpoint-2000/scaler.pt +3 -0
  30. checkpoint-2000/scheduler.pt +3 -0
  31. checkpoint-2000/special_tokens_map.json +37 -0
  32. checkpoint-2000/tokenizer.json +0 -0
  33. checkpoint-2000/tokenizer_config.json +56 -0
  34. checkpoint-2000/trainer_state.json +683 -0
  35. checkpoint-2000/training_args.bin +3 -0
  36. checkpoint-2000/vocab.txt +0 -0
  37. checkpoint-2500/config.json +27 -0
  38. checkpoint-2500/model.safetensors +3 -0
  39. checkpoint-2500/optimizer.pt +3 -0
  40. checkpoint-2500/rng_state.pth +3 -0
  41. checkpoint-2500/scaler.pt +3 -0
  42. checkpoint-2500/scheduler.pt +3 -0
  43. checkpoint-2500/special_tokens_map.json +37 -0
  44. checkpoint-2500/tokenizer.json +0 -0
  45. checkpoint-2500/tokenizer_config.json +56 -0
  46. checkpoint-2500/trainer_state.json +843 -0
  47. checkpoint-2500/training_args.bin +3 -0
  48. checkpoint-2500/vocab.txt +0 -0
  49. checkpoint-3000/config.json +27 -0
  50. checkpoint-3000/model.safetensors +3 -0
checkpoint-1000/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForQuestionAnswering"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "language": "english",
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "name": "Bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 30522
27
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8991e05b422c14c8e2fc5cd3f72feb17bd3677b6f1389322d3fed4a8999b98
3
+ size 435596088
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4f8538c773a31efc17fab7c7d668c5e79c64ca7830d4516cd0480460f573545
3
+ size 871311930
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eb0e8e8bc64936aefc31ee7af1a5d1912fdfb5b07d689677f79a6259ca2c757
3
+ size 14244
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:595fac63b46180ff479afa6e7307045f9376a829986c66b1a84e9eba8d5f2706
3
+ size 988
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a63e8131bb66c9187759ce67cd24c6b46de8a1bef28b9d60900c3736a72e1e4
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 700,
3
+ "best_metric": 79.0143805614001,
4
+ "best_model_checkpoint": "bert-soccer-qa/checkpoint-500",
5
+ "epoch": 0.15537600994406464,
6
+ "eval_steps": 100,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007768800497203232,
14
+ "grad_norm": 9.272425651550293,
15
+ "learning_rate": 9.99925419515227e-06,
16
+ "loss": 1.1446,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.015537600994406464,
21
+ "grad_norm": 12.237289428710938,
22
+ "learning_rate": 9.99847731510255e-06,
23
+ "loss": 0.9455,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.015537600994406464,
28
+ "eval_HasAns_exact": 69.85208252238225,
29
+ "eval_HasAns_f1": 77.13603651720207,
30
+ "eval_HasAns_total": 25690,
31
+ "eval_best_exact": 69.85208252238225,
32
+ "eval_best_exact_thresh": 0.0,
33
+ "eval_best_f1": 77.13603651720207,
34
+ "eval_best_f1_thresh": 0.0,
35
+ "eval_exact": 69.85208252238225,
36
+ "eval_f1": 77.13603651720207,
37
+ "eval_loss": 0.8127343058586121,
38
+ "eval_runtime": 202.0338,
39
+ "eval_samples_per_second": 127.157,
40
+ "eval_steps_per_second": 7.949,
41
+ "eval_total": 25690,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.023306401491609695,
46
+ "grad_norm": 5.626998424530029,
47
+ "learning_rate": 9.997700435052828e-06,
48
+ "loss": 0.8129,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.031075201988812928,
53
+ "grad_norm": 6.87526798248291,
54
+ "learning_rate": 9.996923555003108e-06,
55
+ "loss": 0.8743,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.031075201988812928,
60
+ "eval_HasAns_exact": 70.21409108602569,
61
+ "eval_HasAns_f1": 77.38207884007946,
62
+ "eval_HasAns_total": 25690,
63
+ "eval_best_exact": 70.21409108602569,
64
+ "eval_best_exact_thresh": 0.0,
65
+ "eval_best_f1": 77.38207884007946,
66
+ "eval_best_f1_thresh": 0.0,
67
+ "eval_exact": 70.21409108602569,
68
+ "eval_f1": 77.38207884007946,
69
+ "eval_loss": 0.7383215427398682,
70
+ "eval_runtime": 202.5157,
71
+ "eval_samples_per_second": 126.854,
72
+ "eval_steps_per_second": 7.93,
73
+ "eval_total": 25690,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.03884400248601616,
78
+ "grad_norm": 11.461019515991211,
79
+ "learning_rate": 9.996146674953388e-06,
80
+ "loss": 0.7825,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.04661280298321939,
85
+ "grad_norm": 9.630631446838379,
86
+ "learning_rate": 9.995369794903668e-06,
87
+ "loss": 0.7189,
88
+ "step": 300
89
+ },
90
+ {
91
+ "epoch": 0.04661280298321939,
92
+ "eval_HasAns_exact": 71.03931490852472,
93
+ "eval_HasAns_f1": 78.08897109574,
94
+ "eval_HasAns_total": 25690,
95
+ "eval_best_exact": 71.03931490852472,
96
+ "eval_best_exact_thresh": 0.0,
97
+ "eval_best_f1": 78.08897109574,
98
+ "eval_best_f1_thresh": 0.0,
99
+ "eval_exact": 71.03931490852472,
100
+ "eval_f1": 78.08897109574,
101
+ "eval_loss": 0.7194859981536865,
102
+ "eval_runtime": 202.3679,
103
+ "eval_samples_per_second": 126.947,
104
+ "eval_steps_per_second": 7.936,
105
+ "eval_total": 25690,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.054381603480422626,
110
+ "grad_norm": 11.377346992492676,
111
+ "learning_rate": 9.994592914853948e-06,
112
+ "loss": 0.7889,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.062150403977625855,
117
+ "grad_norm": 7.450821399688721,
118
+ "learning_rate": 9.993816034804228e-06,
119
+ "loss": 0.7367,
120
+ "step": 400
121
+ },
122
+ {
123
+ "epoch": 0.062150403977625855,
124
+ "eval_HasAns_exact": 71.08991825613079,
125
+ "eval_HasAns_f1": 78.00619214091435,
126
+ "eval_HasAns_total": 25690,
127
+ "eval_best_exact": 71.08991825613079,
128
+ "eval_best_exact_thresh": 0.0,
129
+ "eval_best_f1": 78.00619214091435,
130
+ "eval_best_f1_thresh": 0.0,
131
+ "eval_exact": 71.08991825613079,
132
+ "eval_f1": 78.00619214091435,
133
+ "eval_loss": 0.683600902557373,
134
+ "eval_runtime": 202.4728,
135
+ "eval_samples_per_second": 126.881,
136
+ "eval_steps_per_second": 7.932,
137
+ "eval_total": 25690,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.06991920447482909,
142
+ "grad_norm": 14.92029857635498,
143
+ "learning_rate": 9.993039154754508e-06,
144
+ "loss": 0.7838,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.07768800497203232,
149
+ "grad_norm": 9.030844688415527,
150
+ "learning_rate": 9.992262274704786e-06,
151
+ "loss": 0.6469,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 0.07768800497203232,
156
+ "eval_HasAns_exact": 71.42467886337096,
157
+ "eval_HasAns_f1": 78.23334576784337,
158
+ "eval_HasAns_total": 25690,
159
+ "eval_best_exact": 71.42467886337096,
160
+ "eval_best_exact_thresh": 0.0,
161
+ "eval_best_f1": 78.23334576784337,
162
+ "eval_best_f1_thresh": 0.0,
163
+ "eval_exact": 71.42467886337096,
164
+ "eval_f1": 78.23334576784337,
165
+ "eval_loss": 0.6646179556846619,
166
+ "eval_runtime": 202.6333,
167
+ "eval_samples_per_second": 126.781,
168
+ "eval_steps_per_second": 7.926,
169
+ "eval_total": 25690,
170
+ "step": 500
171
+ },
172
+ {
173
+ "epoch": 0.08545680546923555,
174
+ "grad_norm": 10.103166580200195,
175
+ "learning_rate": 9.991485394655066e-06,
176
+ "loss": 0.725,
177
+ "step": 550
178
+ },
179
+ {
180
+ "epoch": 0.09322560596643878,
181
+ "grad_norm": 6.986274242401123,
182
+ "learning_rate": 9.990708514605346e-06,
183
+ "loss": 0.6657,
184
+ "step": 600
185
+ },
186
+ {
187
+ "epoch": 0.09322560596643878,
188
+ "eval_HasAns_exact": 70.2841572596341,
189
+ "eval_HasAns_f1": 77.1247889416155,
190
+ "eval_HasAns_total": 25690,
191
+ "eval_best_exact": 70.2841572596341,
192
+ "eval_best_exact_thresh": 0.0,
193
+ "eval_best_f1": 77.1247889416155,
194
+ "eval_best_f1_thresh": 0.0,
195
+ "eval_exact": 70.2841572596341,
196
+ "eval_f1": 77.1247889416155,
197
+ "eval_loss": 0.6492825150489807,
198
+ "eval_runtime": 203.3854,
199
+ "eval_samples_per_second": 126.312,
200
+ "eval_steps_per_second": 7.896,
201
+ "eval_total": 25690,
202
+ "step": 600
203
+ },
204
+ {
205
+ "epoch": 0.10099440646364201,
206
+ "grad_norm": 10.636602401733398,
207
+ "learning_rate": 9.989931634555626e-06,
208
+ "loss": 0.7337,
209
+ "step": 650
210
+ },
211
+ {
212
+ "epoch": 0.10876320696084525,
213
+ "grad_norm": 8.252824783325195,
214
+ "learning_rate": 9.989154754505906e-06,
215
+ "loss": 0.662,
216
+ "step": 700
217
+ },
218
+ {
219
+ "epoch": 0.10876320696084525,
220
+ "eval_HasAns_exact": 72.22654729466718,
221
+ "eval_HasAns_f1": 79.0143805614001,
222
+ "eval_HasAns_total": 25690,
223
+ "eval_best_exact": 72.22654729466718,
224
+ "eval_best_exact_thresh": 0.0,
225
+ "eval_best_f1": 79.0143805614001,
226
+ "eval_best_f1_thresh": 0.0,
227
+ "eval_exact": 72.22654729466718,
228
+ "eval_f1": 79.0143805614001,
229
+ "eval_loss": 0.6340453028678894,
230
+ "eval_runtime": 202.8216,
231
+ "eval_samples_per_second": 126.663,
232
+ "eval_steps_per_second": 7.918,
233
+ "eval_total": 25690,
234
+ "step": 700
235
+ },
236
+ {
237
+ "epoch": 0.11653200745804848,
238
+ "grad_norm": 8.463785171508789,
239
+ "learning_rate": 9.988377874456184e-06,
240
+ "loss": 0.7265,
241
+ "step": 750
242
+ },
243
+ {
244
+ "epoch": 0.12430080795525171,
245
+ "grad_norm": 9.748174667358398,
246
+ "learning_rate": 9.987600994406464e-06,
247
+ "loss": 0.6969,
248
+ "step": 800
249
+ },
250
+ {
251
+ "epoch": 0.12430080795525171,
252
+ "eval_HasAns_exact": 72.1292331646555,
253
+ "eval_HasAns_f1": 78.83892105701732,
254
+ "eval_HasAns_total": 25690,
255
+ "eval_best_exact": 72.1292331646555,
256
+ "eval_best_exact_thresh": 0.0,
257
+ "eval_best_f1": 78.83892105701732,
258
+ "eval_best_f1_thresh": 0.0,
259
+ "eval_exact": 72.1292331646555,
260
+ "eval_f1": 78.83892105701732,
261
+ "eval_loss": 0.6085864901542664,
262
+ "eval_runtime": 202.3194,
263
+ "eval_samples_per_second": 126.977,
264
+ "eval_steps_per_second": 7.938,
265
+ "eval_total": 25690,
266
+ "step": 800
267
+ },
268
+ {
269
+ "epoch": 0.13206960845245494,
270
+ "grad_norm": 9.786517143249512,
271
+ "learning_rate": 9.986824114356744e-06,
272
+ "loss": 0.7111,
273
+ "step": 850
274
+ },
275
+ {
276
+ "epoch": 0.13983840894965818,
277
+ "grad_norm": 8.391840934753418,
278
+ "learning_rate": 9.986047234307024e-06,
279
+ "loss": 0.669,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 0.13983840894965818,
284
+ "eval_HasAns_exact": 71.89567925262749,
285
+ "eval_HasAns_f1": 78.61371443349637,
286
+ "eval_HasAns_total": 25690,
287
+ "eval_best_exact": 71.89567925262749,
288
+ "eval_best_exact_thresh": 0.0,
289
+ "eval_best_f1": 78.61371443349637,
290
+ "eval_best_f1_thresh": 0.0,
291
+ "eval_exact": 71.89567925262749,
292
+ "eval_f1": 78.61371443349637,
293
+ "eval_loss": 0.5937665104866028,
294
+ "eval_runtime": 202.8476,
295
+ "eval_samples_per_second": 126.647,
296
+ "eval_steps_per_second": 7.917,
297
+ "eval_total": 25690,
298
+ "step": 900
299
+ },
300
+ {
301
+ "epoch": 0.1476072094468614,
302
+ "grad_norm": 11.602773666381836,
303
+ "learning_rate": 9.985270354257304e-06,
304
+ "loss": 0.7253,
305
+ "step": 950
306
+ },
307
+ {
308
+ "epoch": 0.15537600994406464,
309
+ "grad_norm": 9.150772094726562,
310
+ "learning_rate": 9.984493474207582e-06,
311
+ "loss": 0.6676,
312
+ "step": 1000
313
+ },
314
+ {
315
+ "epoch": 0.15537600994406464,
316
+ "eval_HasAns_exact": 72.28493577267419,
317
+ "eval_HasAns_f1": 78.8876551339714,
318
+ "eval_HasAns_total": 25690,
319
+ "eval_best_exact": 72.28493577267419,
320
+ "eval_best_exact_thresh": 0.0,
321
+ "eval_best_f1": 78.8876551339714,
322
+ "eval_best_f1_thresh": 0.0,
323
+ "eval_exact": 72.28493577267419,
324
+ "eval_f1": 78.8876551339714,
325
+ "eval_loss": 0.5816648602485657,
326
+ "eval_runtime": 202.8184,
327
+ "eval_samples_per_second": 126.665,
328
+ "eval_steps_per_second": 7.918,
329
+ "eval_total": 25690,
330
+ "step": 1000
331
+ }
332
+ ],
333
+ "logging_steps": 50,
334
+ "max_steps": 643600,
335
+ "num_input_tokens_seen": 0,
336
+ "num_train_epochs": 100,
337
+ "save_steps": 500,
338
+ "stateful_callbacks": {
339
+ "EarlyStoppingCallback": {
340
+ "args": {
341
+ "early_stopping_patience": 10,
342
+ "early_stopping_threshold": 0.0
343
+ },
344
+ "attributes": {
345
+ "early_stopping_patience_counter": 3
346
+ }
347
+ },
348
+ "TrainerControl": {
349
+ "args": {
350
+ "should_epoch_stop": false,
351
+ "should_evaluate": false,
352
+ "should_log": false,
353
+ "should_save": true,
354
+ "should_training_stop": false
355
+ },
356
+ "attributes": {}
357
+ }
358
+ },
359
+ "total_flos": 8361496215552000.0,
360
+ "train_batch_size": 16,
361
+ "trial_name": null,
362
+ "trial_params": null
363
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6670cd309bc79c4638de77be66649432fb3a049d09959fd300cbc983c7c7160d
3
+ size 5304
checkpoint-1000/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForQuestionAnswering"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "language": "english",
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "name": "Bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 30522
27
+ }
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b5e818a28c003e9d5e292e503ba6339f07e2e958436351bbc477e0529c9b4fc
3
+ size 435596088
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:769230f395510a338c8b7097d73cf1204d74261b21047d516f5174cc75b14eaf
3
+ size 871311930
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba088525666769e1cf08d578b612b2c50f01fcf002579f70e2b63fb97fbe2652
3
+ size 14244
checkpoint-1500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0808910e967b8f15cb8cc606607565e13ec061ebffc9c741d03a50211415c905
3
+ size 988
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63aa3335b5a4100e7305fdfb065b15de8bd1e2aeacb68069ea089d83049885f9
3
+ size 1064
checkpoint-1500/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1500,
3
+ "best_metric": 79.36155045560372,
4
+ "best_model_checkpoint": "bert-soccer-qa/checkpoint-1500",
5
+ "epoch": 0.23306401491609696,
6
+ "eval_steps": 100,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007768800497203232,
14
+ "grad_norm": 9.272425651550293,
15
+ "learning_rate": 9.99925419515227e-06,
16
+ "loss": 1.1446,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.015537600994406464,
21
+ "grad_norm": 12.237289428710938,
22
+ "learning_rate": 9.99847731510255e-06,
23
+ "loss": 0.9455,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.015537600994406464,
28
+ "eval_HasAns_exact": 69.85208252238225,
29
+ "eval_HasAns_f1": 77.13603651720207,
30
+ "eval_HasAns_total": 25690,
31
+ "eval_best_exact": 69.85208252238225,
32
+ "eval_best_exact_thresh": 0.0,
33
+ "eval_best_f1": 77.13603651720207,
34
+ "eval_best_f1_thresh": 0.0,
35
+ "eval_exact": 69.85208252238225,
36
+ "eval_f1": 77.13603651720207,
37
+ "eval_loss": 0.8127343058586121,
38
+ "eval_runtime": 202.0338,
39
+ "eval_samples_per_second": 127.157,
40
+ "eval_steps_per_second": 7.949,
41
+ "eval_total": 25690,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.023306401491609695,
46
+ "grad_norm": 5.626998424530029,
47
+ "learning_rate": 9.997700435052828e-06,
48
+ "loss": 0.8129,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.031075201988812928,
53
+ "grad_norm": 6.87526798248291,
54
+ "learning_rate": 9.996923555003108e-06,
55
+ "loss": 0.8743,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.031075201988812928,
60
+ "eval_HasAns_exact": 70.21409108602569,
61
+ "eval_HasAns_f1": 77.38207884007946,
62
+ "eval_HasAns_total": 25690,
63
+ "eval_best_exact": 70.21409108602569,
64
+ "eval_best_exact_thresh": 0.0,
65
+ "eval_best_f1": 77.38207884007946,
66
+ "eval_best_f1_thresh": 0.0,
67
+ "eval_exact": 70.21409108602569,
68
+ "eval_f1": 77.38207884007946,
69
+ "eval_loss": 0.7383215427398682,
70
+ "eval_runtime": 202.5157,
71
+ "eval_samples_per_second": 126.854,
72
+ "eval_steps_per_second": 7.93,
73
+ "eval_total": 25690,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.03884400248601616,
78
+ "grad_norm": 11.461019515991211,
79
+ "learning_rate": 9.996146674953388e-06,
80
+ "loss": 0.7825,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.04661280298321939,
85
+ "grad_norm": 9.630631446838379,
86
+ "learning_rate": 9.995369794903668e-06,
87
+ "loss": 0.7189,
88
+ "step": 300
89
+ },
90
+ {
91
+ "epoch": 0.04661280298321939,
92
+ "eval_HasAns_exact": 71.03931490852472,
93
+ "eval_HasAns_f1": 78.08897109574,
94
+ "eval_HasAns_total": 25690,
95
+ "eval_best_exact": 71.03931490852472,
96
+ "eval_best_exact_thresh": 0.0,
97
+ "eval_best_f1": 78.08897109574,
98
+ "eval_best_f1_thresh": 0.0,
99
+ "eval_exact": 71.03931490852472,
100
+ "eval_f1": 78.08897109574,
101
+ "eval_loss": 0.7194859981536865,
102
+ "eval_runtime": 202.3679,
103
+ "eval_samples_per_second": 126.947,
104
+ "eval_steps_per_second": 7.936,
105
+ "eval_total": 25690,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.054381603480422626,
110
+ "grad_norm": 11.377346992492676,
111
+ "learning_rate": 9.994592914853948e-06,
112
+ "loss": 0.7889,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.062150403977625855,
117
+ "grad_norm": 7.450821399688721,
118
+ "learning_rate": 9.993816034804228e-06,
119
+ "loss": 0.7367,
120
+ "step": 400
121
+ },
122
+ {
123
+ "epoch": 0.062150403977625855,
124
+ "eval_HasAns_exact": 71.08991825613079,
125
+ "eval_HasAns_f1": 78.00619214091435,
126
+ "eval_HasAns_total": 25690,
127
+ "eval_best_exact": 71.08991825613079,
128
+ "eval_best_exact_thresh": 0.0,
129
+ "eval_best_f1": 78.00619214091435,
130
+ "eval_best_f1_thresh": 0.0,
131
+ "eval_exact": 71.08991825613079,
132
+ "eval_f1": 78.00619214091435,
133
+ "eval_loss": 0.683600902557373,
134
+ "eval_runtime": 202.4728,
135
+ "eval_samples_per_second": 126.881,
136
+ "eval_steps_per_second": 7.932,
137
+ "eval_total": 25690,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.06991920447482909,
142
+ "grad_norm": 14.92029857635498,
143
+ "learning_rate": 9.993039154754508e-06,
144
+ "loss": 0.7838,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.07768800497203232,
149
+ "grad_norm": 9.030844688415527,
150
+ "learning_rate": 9.992262274704786e-06,
151
+ "loss": 0.6469,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 0.07768800497203232,
156
+ "eval_HasAns_exact": 71.42467886337096,
157
+ "eval_HasAns_f1": 78.23334576784337,
158
+ "eval_HasAns_total": 25690,
159
+ "eval_best_exact": 71.42467886337096,
160
+ "eval_best_exact_thresh": 0.0,
161
+ "eval_best_f1": 78.23334576784337,
162
+ "eval_best_f1_thresh": 0.0,
163
+ "eval_exact": 71.42467886337096,
164
+ "eval_f1": 78.23334576784337,
165
+ "eval_loss": 0.6646179556846619,
166
+ "eval_runtime": 202.6333,
167
+ "eval_samples_per_second": 126.781,
168
+ "eval_steps_per_second": 7.926,
169
+ "eval_total": 25690,
170
+ "step": 500
171
+ },
172
+ {
173
+ "epoch": 0.08545680546923555,
174
+ "grad_norm": 10.103166580200195,
175
+ "learning_rate": 9.991485394655066e-06,
176
+ "loss": 0.725,
177
+ "step": 550
178
+ },
179
+ {
180
+ "epoch": 0.09322560596643878,
181
+ "grad_norm": 6.986274242401123,
182
+ "learning_rate": 9.990708514605346e-06,
183
+ "loss": 0.6657,
184
+ "step": 600
185
+ },
186
+ {
187
+ "epoch": 0.09322560596643878,
188
+ "eval_HasAns_exact": 70.2841572596341,
189
+ "eval_HasAns_f1": 77.1247889416155,
190
+ "eval_HasAns_total": 25690,
191
+ "eval_best_exact": 70.2841572596341,
192
+ "eval_best_exact_thresh": 0.0,
193
+ "eval_best_f1": 77.1247889416155,
194
+ "eval_best_f1_thresh": 0.0,
195
+ "eval_exact": 70.2841572596341,
196
+ "eval_f1": 77.1247889416155,
197
+ "eval_loss": 0.6492825150489807,
198
+ "eval_runtime": 203.3854,
199
+ "eval_samples_per_second": 126.312,
200
+ "eval_steps_per_second": 7.896,
201
+ "eval_total": 25690,
202
+ "step": 600
203
+ },
204
+ {
205
+ "epoch": 0.10099440646364201,
206
+ "grad_norm": 10.636602401733398,
207
+ "learning_rate": 9.989931634555626e-06,
208
+ "loss": 0.7337,
209
+ "step": 650
210
+ },
211
+ {
212
+ "epoch": 0.10876320696084525,
213
+ "grad_norm": 8.252824783325195,
214
+ "learning_rate": 9.989154754505906e-06,
215
+ "loss": 0.662,
216
+ "step": 700
217
+ },
218
+ {
219
+ "epoch": 0.10876320696084525,
220
+ "eval_HasAns_exact": 72.22654729466718,
221
+ "eval_HasAns_f1": 79.0143805614001,
222
+ "eval_HasAns_total": 25690,
223
+ "eval_best_exact": 72.22654729466718,
224
+ "eval_best_exact_thresh": 0.0,
225
+ "eval_best_f1": 79.0143805614001,
226
+ "eval_best_f1_thresh": 0.0,
227
+ "eval_exact": 72.22654729466718,
228
+ "eval_f1": 79.0143805614001,
229
+ "eval_loss": 0.6340453028678894,
230
+ "eval_runtime": 202.8216,
231
+ "eval_samples_per_second": 126.663,
232
+ "eval_steps_per_second": 7.918,
233
+ "eval_total": 25690,
234
+ "step": 700
235
+ },
236
+ {
237
+ "epoch": 0.11653200745804848,
238
+ "grad_norm": 8.463785171508789,
239
+ "learning_rate": 9.988377874456184e-06,
240
+ "loss": 0.7265,
241
+ "step": 750
242
+ },
243
+ {
244
+ "epoch": 0.12430080795525171,
245
+ "grad_norm": 9.748174667358398,
246
+ "learning_rate": 9.987600994406464e-06,
247
+ "loss": 0.6969,
248
+ "step": 800
249
+ },
250
+ {
251
+ "epoch": 0.12430080795525171,
252
+ "eval_HasAns_exact": 72.1292331646555,
253
+ "eval_HasAns_f1": 78.83892105701732,
254
+ "eval_HasAns_total": 25690,
255
+ "eval_best_exact": 72.1292331646555,
256
+ "eval_best_exact_thresh": 0.0,
257
+ "eval_best_f1": 78.83892105701732,
258
+ "eval_best_f1_thresh": 0.0,
259
+ "eval_exact": 72.1292331646555,
260
+ "eval_f1": 78.83892105701732,
261
+ "eval_loss": 0.6085864901542664,
262
+ "eval_runtime": 202.3194,
263
+ "eval_samples_per_second": 126.977,
264
+ "eval_steps_per_second": 7.938,
265
+ "eval_total": 25690,
266
+ "step": 800
267
+ },
268
+ {
269
+ "epoch": 0.13206960845245494,
270
+ "grad_norm": 9.786517143249512,
271
+ "learning_rate": 9.986824114356744e-06,
272
+ "loss": 0.7111,
273
+ "step": 850
274
+ },
275
+ {
276
+ "epoch": 0.13983840894965818,
277
+ "grad_norm": 8.391840934753418,
278
+ "learning_rate": 9.986047234307024e-06,
279
+ "loss": 0.669,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 0.13983840894965818,
284
+ "eval_HasAns_exact": 71.89567925262749,
285
+ "eval_HasAns_f1": 78.61371443349637,
286
+ "eval_HasAns_total": 25690,
287
+ "eval_best_exact": 71.89567925262749,
288
+ "eval_best_exact_thresh": 0.0,
289
+ "eval_best_f1": 78.61371443349637,
290
+ "eval_best_f1_thresh": 0.0,
291
+ "eval_exact": 71.89567925262749,
292
+ "eval_f1": 78.61371443349637,
293
+ "eval_loss": 0.5937665104866028,
294
+ "eval_runtime": 202.8476,
295
+ "eval_samples_per_second": 126.647,
296
+ "eval_steps_per_second": 7.917,
297
+ "eval_total": 25690,
298
+ "step": 900
299
+ },
300
+ {
301
+ "epoch": 0.1476072094468614,
302
+ "grad_norm": 11.602773666381836,
303
+ "learning_rate": 9.985270354257304e-06,
304
+ "loss": 0.7253,
305
+ "step": 950
306
+ },
307
+ {
308
+ "epoch": 0.15537600994406464,
309
+ "grad_norm": 9.150772094726562,
310
+ "learning_rate": 9.984493474207582e-06,
311
+ "loss": 0.6676,
312
+ "step": 1000
313
+ },
314
+ {
315
+ "epoch": 0.15537600994406464,
316
+ "eval_HasAns_exact": 72.28493577267419,
317
+ "eval_HasAns_f1": 78.8876551339714,
318
+ "eval_HasAns_total": 25690,
319
+ "eval_best_exact": 72.28493577267419,
320
+ "eval_best_exact_thresh": 0.0,
321
+ "eval_best_f1": 78.8876551339714,
322
+ "eval_best_f1_thresh": 0.0,
323
+ "eval_exact": 72.28493577267419,
324
+ "eval_f1": 78.8876551339714,
325
+ "eval_loss": 0.5816648602485657,
326
+ "eval_runtime": 202.8184,
327
+ "eval_samples_per_second": 126.665,
328
+ "eval_steps_per_second": 7.918,
329
+ "eval_total": 25690,
330
+ "step": 1000
331
+ },
332
+ {
333
+ "epoch": 0.16314481044126786,
334
+ "grad_norm": 10.605375289916992,
335
+ "learning_rate": 9.983716594157864e-06,
336
+ "loss": 0.7131,
337
+ "step": 1050
338
+ },
339
+ {
340
+ "epoch": 0.1709136109384711,
341
+ "grad_norm": 13.075970649719238,
342
+ "learning_rate": 9.982939714108144e-06,
343
+ "loss": 0.6664,
344
+ "step": 1100
345
+ },
346
+ {
347
+ "epoch": 0.1709136109384711,
348
+ "eval_HasAns_exact": 71.95406773063449,
349
+ "eval_HasAns_f1": 78.6814885378308,
350
+ "eval_HasAns_total": 25690,
351
+ "eval_best_exact": 71.95406773063449,
352
+ "eval_best_exact_thresh": 0.0,
353
+ "eval_best_f1": 78.6814885378308,
354
+ "eval_best_f1_thresh": 0.0,
355
+ "eval_exact": 71.95406773063449,
356
+ "eval_f1": 78.6814885378308,
357
+ "eval_loss": 0.5695982575416565,
358
+ "eval_runtime": 202.5931,
359
+ "eval_samples_per_second": 126.806,
360
+ "eval_steps_per_second": 7.927,
361
+ "eval_total": 25690,
362
+ "step": 1100
363
+ },
364
+ {
365
+ "epoch": 0.17868241143567434,
366
+ "grad_norm": 12.333536148071289,
367
+ "learning_rate": 9.982162834058422e-06,
368
+ "loss": 0.6016,
369
+ "step": 1150
370
+ },
371
+ {
372
+ "epoch": 0.18645121193287756,
373
+ "grad_norm": 9.748809814453125,
374
+ "learning_rate": 9.981385954008702e-06,
375
+ "loss": 0.6006,
376
+ "step": 1200
377
+ },
378
+ {
379
+ "epoch": 0.18645121193287756,
380
+ "eval_HasAns_exact": 72.06305955624757,
381
+ "eval_HasAns_f1": 78.75337714295185,
382
+ "eval_HasAns_total": 25690,
383
+ "eval_best_exact": 72.06305955624757,
384
+ "eval_best_exact_thresh": 0.0,
385
+ "eval_best_f1": 78.75337714295185,
386
+ "eval_best_f1_thresh": 0.0,
387
+ "eval_exact": 72.06305955624757,
388
+ "eval_f1": 78.75337714295185,
389
+ "eval_loss": 0.5660755038261414,
390
+ "eval_runtime": 202.8834,
391
+ "eval_samples_per_second": 126.624,
392
+ "eval_steps_per_second": 7.916,
393
+ "eval_total": 25690,
394
+ "step": 1200
395
+ },
396
+ {
397
+ "epoch": 0.1942200124300808,
398
+ "grad_norm": 9.964409828186035,
399
+ "learning_rate": 9.980609073958982e-06,
400
+ "loss": 0.6268,
401
+ "step": 1250
402
+ },
403
+ {
404
+ "epoch": 0.20198881292728402,
405
+ "grad_norm": 6.387030124664307,
406
+ "learning_rate": 9.979832193909262e-06,
407
+ "loss": 0.6111,
408
+ "step": 1300
409
+ },
410
+ {
411
+ "epoch": 0.20198881292728402,
412
+ "eval_HasAns_exact": 72.65862203191904,
413
+ "eval_HasAns_f1": 79.23512118883761,
414
+ "eval_HasAns_total": 25690,
415
+ "eval_best_exact": 72.65862203191904,
416
+ "eval_best_exact_thresh": 0.0,
417
+ "eval_best_f1": 79.23512118883761,
418
+ "eval_best_f1_thresh": 0.0,
419
+ "eval_exact": 72.65862203191904,
420
+ "eval_f1": 79.23512118883761,
421
+ "eval_loss": 0.5586764216423035,
422
+ "eval_runtime": 203.0862,
423
+ "eval_samples_per_second": 126.498,
424
+ "eval_steps_per_second": 7.908,
425
+ "eval_total": 25690,
426
+ "step": 1300
427
+ },
428
+ {
429
+ "epoch": 0.20975761342448726,
430
+ "grad_norm": 9.103731155395508,
431
+ "learning_rate": 9.979055313859542e-06,
432
+ "loss": 0.566,
433
+ "step": 1350
434
+ },
435
+ {
436
+ "epoch": 0.2175264139216905,
437
+ "grad_norm": 13.135197639465332,
438
+ "learning_rate": 9.97827843380982e-06,
439
+ "loss": 0.5793,
440
+ "step": 1400
441
+ },
442
+ {
443
+ "epoch": 0.2175264139216905,
444
+ "eval_HasAns_exact": 72.39392759828728,
445
+ "eval_HasAns_f1": 79.00515970901382,
446
+ "eval_HasAns_total": 25690,
447
+ "eval_best_exact": 72.39392759828728,
448
+ "eval_best_exact_thresh": 0.0,
449
+ "eval_best_f1": 79.00515970901382,
450
+ "eval_best_f1_thresh": 0.0,
451
+ "eval_exact": 72.39392759828728,
452
+ "eval_f1": 79.00515970901382,
453
+ "eval_loss": 0.5600055456161499,
454
+ "eval_runtime": 202.5804,
455
+ "eval_samples_per_second": 126.814,
456
+ "eval_steps_per_second": 7.928,
457
+ "eval_total": 25690,
458
+ "step": 1400
459
+ },
460
+ {
461
+ "epoch": 0.22529521441889372,
462
+ "grad_norm": 9.040102005004883,
463
+ "learning_rate": 9.9775015537601e-06,
464
+ "loss": 0.6309,
465
+ "step": 1450
466
+ },
467
+ {
468
+ "epoch": 0.23306401491609696,
469
+ "grad_norm": 11.526878356933594,
470
+ "learning_rate": 9.97672467371038e-06,
471
+ "loss": 0.6064,
472
+ "step": 1500
473
+ },
474
+ {
475
+ "epoch": 0.23306401491609696,
476
+ "eval_HasAns_exact": 72.74425846632931,
477
+ "eval_HasAns_f1": 79.36155045560372,
478
+ "eval_HasAns_total": 25690,
479
+ "eval_best_exact": 72.74425846632931,
480
+ "eval_best_exact_thresh": 0.0,
481
+ "eval_best_f1": 79.36155045560372,
482
+ "eval_best_f1_thresh": 0.0,
483
+ "eval_exact": 72.74425846632931,
484
+ "eval_f1": 79.36155045560372,
485
+ "eval_loss": 0.5500572323799133,
486
+ "eval_runtime": 202.9048,
487
+ "eval_samples_per_second": 126.611,
488
+ "eval_steps_per_second": 7.915,
489
+ "eval_total": 25690,
490
+ "step": 1500
491
+ }
492
+ ],
493
+ "logging_steps": 50,
494
+ "max_steps": 643600,
495
+ "num_input_tokens_seen": 0,
496
+ "num_train_epochs": 100,
497
+ "save_steps": 500,
498
+ "stateful_callbacks": {
499
+ "EarlyStoppingCallback": {
500
+ "args": {
501
+ "early_stopping_patience": 10,
502
+ "early_stopping_threshold": 0.0
503
+ },
504
+ "attributes": {
505
+ "early_stopping_patience_counter": 0
506
+ }
507
+ },
508
+ "TrainerControl": {
509
+ "args": {
510
+ "should_epoch_stop": false,
511
+ "should_evaluate": false,
512
+ "should_log": false,
513
+ "should_save": true,
514
+ "should_training_stop": false
515
+ },
516
+ "attributes": {}
517
+ }
518
+ },
519
+ "total_flos": 1.2542244323328e+16,
520
+ "train_batch_size": 16,
521
+ "trial_name": null,
522
+ "trial_params": null
523
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6670cd309bc79c4638de77be66649432fb3a049d09959fd300cbc983c7c7160d
3
+ size 5304
checkpoint-1500/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForQuestionAnswering"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "language": "english",
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "name": "Bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 30522
27
+ }
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab6e779400893356b64fd6c6878843a02872e859352b7ea9771af23ef27e5dec
3
+ size 435596088
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:794a28f346fb8d4167151f7684b737964ac1ec3c38428a462137bbee9f32d6a9
3
+ size 871311930
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c739be83fa1f3a50eced1baaf28a6293b7420251b4834aa76b330fef386f8a43
3
+ size 14244
checkpoint-2000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e70d8e700c15b370a15146b2fea86ef770d6a5dad45912e9295eff40cd0193
3
+ size 988
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:828b5dd5761979bb2a6ab76639bf89d01ac2bc7d013d9600252487ed2a1577d2
3
+ size 1064
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
checkpoint-2000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1500,
3
+ "best_metric": 79.36155045560372,
4
+ "best_model_checkpoint": "bert-soccer-qa/checkpoint-1500",
5
+ "epoch": 0.3107520198881293,
6
+ "eval_steps": 100,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007768800497203232,
14
+ "grad_norm": 9.272425651550293,
15
+ "learning_rate": 9.99925419515227e-06,
16
+ "loss": 1.1446,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.015537600994406464,
21
+ "grad_norm": 12.237289428710938,
22
+ "learning_rate": 9.99847731510255e-06,
23
+ "loss": 0.9455,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.015537600994406464,
28
+ "eval_HasAns_exact": 69.85208252238225,
29
+ "eval_HasAns_f1": 77.13603651720207,
30
+ "eval_HasAns_total": 25690,
31
+ "eval_best_exact": 69.85208252238225,
32
+ "eval_best_exact_thresh": 0.0,
33
+ "eval_best_f1": 77.13603651720207,
34
+ "eval_best_f1_thresh": 0.0,
35
+ "eval_exact": 69.85208252238225,
36
+ "eval_f1": 77.13603651720207,
37
+ "eval_loss": 0.8127343058586121,
38
+ "eval_runtime": 202.0338,
39
+ "eval_samples_per_second": 127.157,
40
+ "eval_steps_per_second": 7.949,
41
+ "eval_total": 25690,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.023306401491609695,
46
+ "grad_norm": 5.626998424530029,
47
+ "learning_rate": 9.997700435052828e-06,
48
+ "loss": 0.8129,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.031075201988812928,
53
+ "grad_norm": 6.87526798248291,
54
+ "learning_rate": 9.996923555003108e-06,
55
+ "loss": 0.8743,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.031075201988812928,
60
+ "eval_HasAns_exact": 70.21409108602569,
61
+ "eval_HasAns_f1": 77.38207884007946,
62
+ "eval_HasAns_total": 25690,
63
+ "eval_best_exact": 70.21409108602569,
64
+ "eval_best_exact_thresh": 0.0,
65
+ "eval_best_f1": 77.38207884007946,
66
+ "eval_best_f1_thresh": 0.0,
67
+ "eval_exact": 70.21409108602569,
68
+ "eval_f1": 77.38207884007946,
69
+ "eval_loss": 0.7383215427398682,
70
+ "eval_runtime": 202.5157,
71
+ "eval_samples_per_second": 126.854,
72
+ "eval_steps_per_second": 7.93,
73
+ "eval_total": 25690,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.03884400248601616,
78
+ "grad_norm": 11.461019515991211,
79
+ "learning_rate": 9.996146674953388e-06,
80
+ "loss": 0.7825,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.04661280298321939,
85
+ "grad_norm": 9.630631446838379,
86
+ "learning_rate": 9.995369794903668e-06,
87
+ "loss": 0.7189,
88
+ "step": 300
89
+ },
90
+ {
91
+ "epoch": 0.04661280298321939,
92
+ "eval_HasAns_exact": 71.03931490852472,
93
+ "eval_HasAns_f1": 78.08897109574,
94
+ "eval_HasAns_total": 25690,
95
+ "eval_best_exact": 71.03931490852472,
96
+ "eval_best_exact_thresh": 0.0,
97
+ "eval_best_f1": 78.08897109574,
98
+ "eval_best_f1_thresh": 0.0,
99
+ "eval_exact": 71.03931490852472,
100
+ "eval_f1": 78.08897109574,
101
+ "eval_loss": 0.7194859981536865,
102
+ "eval_runtime": 202.3679,
103
+ "eval_samples_per_second": 126.947,
104
+ "eval_steps_per_second": 7.936,
105
+ "eval_total": 25690,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.054381603480422626,
110
+ "grad_norm": 11.377346992492676,
111
+ "learning_rate": 9.994592914853948e-06,
112
+ "loss": 0.7889,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.062150403977625855,
117
+ "grad_norm": 7.450821399688721,
118
+ "learning_rate": 9.993816034804228e-06,
119
+ "loss": 0.7367,
120
+ "step": 400
121
+ },
122
+ {
123
+ "epoch": 0.062150403977625855,
124
+ "eval_HasAns_exact": 71.08991825613079,
125
+ "eval_HasAns_f1": 78.00619214091435,
126
+ "eval_HasAns_total": 25690,
127
+ "eval_best_exact": 71.08991825613079,
128
+ "eval_best_exact_thresh": 0.0,
129
+ "eval_best_f1": 78.00619214091435,
130
+ "eval_best_f1_thresh": 0.0,
131
+ "eval_exact": 71.08991825613079,
132
+ "eval_f1": 78.00619214091435,
133
+ "eval_loss": 0.683600902557373,
134
+ "eval_runtime": 202.4728,
135
+ "eval_samples_per_second": 126.881,
136
+ "eval_steps_per_second": 7.932,
137
+ "eval_total": 25690,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.06991920447482909,
142
+ "grad_norm": 14.92029857635498,
143
+ "learning_rate": 9.993039154754508e-06,
144
+ "loss": 0.7838,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.07768800497203232,
149
+ "grad_norm": 9.030844688415527,
150
+ "learning_rate": 9.992262274704786e-06,
151
+ "loss": 0.6469,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 0.07768800497203232,
156
+ "eval_HasAns_exact": 71.42467886337096,
157
+ "eval_HasAns_f1": 78.23334576784337,
158
+ "eval_HasAns_total": 25690,
159
+ "eval_best_exact": 71.42467886337096,
160
+ "eval_best_exact_thresh": 0.0,
161
+ "eval_best_f1": 78.23334576784337,
162
+ "eval_best_f1_thresh": 0.0,
163
+ "eval_exact": 71.42467886337096,
164
+ "eval_f1": 78.23334576784337,
165
+ "eval_loss": 0.6646179556846619,
166
+ "eval_runtime": 202.6333,
167
+ "eval_samples_per_second": 126.781,
168
+ "eval_steps_per_second": 7.926,
169
+ "eval_total": 25690,
170
+ "step": 500
171
+ },
172
+ {
173
+ "epoch": 0.08545680546923555,
174
+ "grad_norm": 10.103166580200195,
175
+ "learning_rate": 9.991485394655066e-06,
176
+ "loss": 0.725,
177
+ "step": 550
178
+ },
179
+ {
180
+ "epoch": 0.09322560596643878,
181
+ "grad_norm": 6.986274242401123,
182
+ "learning_rate": 9.990708514605346e-06,
183
+ "loss": 0.6657,
184
+ "step": 600
185
+ },
186
+ {
187
+ "epoch": 0.09322560596643878,
188
+ "eval_HasAns_exact": 70.2841572596341,
189
+ "eval_HasAns_f1": 77.1247889416155,
190
+ "eval_HasAns_total": 25690,
191
+ "eval_best_exact": 70.2841572596341,
192
+ "eval_best_exact_thresh": 0.0,
193
+ "eval_best_f1": 77.1247889416155,
194
+ "eval_best_f1_thresh": 0.0,
195
+ "eval_exact": 70.2841572596341,
196
+ "eval_f1": 77.1247889416155,
197
+ "eval_loss": 0.6492825150489807,
198
+ "eval_runtime": 203.3854,
199
+ "eval_samples_per_second": 126.312,
200
+ "eval_steps_per_second": 7.896,
201
+ "eval_total": 25690,
202
+ "step": 600
203
+ },
204
+ {
205
+ "epoch": 0.10099440646364201,
206
+ "grad_norm": 10.636602401733398,
207
+ "learning_rate": 9.989931634555626e-06,
208
+ "loss": 0.7337,
209
+ "step": 650
210
+ },
211
+ {
212
+ "epoch": 0.10876320696084525,
213
+ "grad_norm": 8.252824783325195,
214
+ "learning_rate": 9.989154754505906e-06,
215
+ "loss": 0.662,
216
+ "step": 700
217
+ },
218
+ {
219
+ "epoch": 0.10876320696084525,
220
+ "eval_HasAns_exact": 72.22654729466718,
221
+ "eval_HasAns_f1": 79.0143805614001,
222
+ "eval_HasAns_total": 25690,
223
+ "eval_best_exact": 72.22654729466718,
224
+ "eval_best_exact_thresh": 0.0,
225
+ "eval_best_f1": 79.0143805614001,
226
+ "eval_best_f1_thresh": 0.0,
227
+ "eval_exact": 72.22654729466718,
228
+ "eval_f1": 79.0143805614001,
229
+ "eval_loss": 0.6340453028678894,
230
+ "eval_runtime": 202.8216,
231
+ "eval_samples_per_second": 126.663,
232
+ "eval_steps_per_second": 7.918,
233
+ "eval_total": 25690,
234
+ "step": 700
235
+ },
236
+ {
237
+ "epoch": 0.11653200745804848,
238
+ "grad_norm": 8.463785171508789,
239
+ "learning_rate": 9.988377874456184e-06,
240
+ "loss": 0.7265,
241
+ "step": 750
242
+ },
243
+ {
244
+ "epoch": 0.12430080795525171,
245
+ "grad_norm": 9.748174667358398,
246
+ "learning_rate": 9.987600994406464e-06,
247
+ "loss": 0.6969,
248
+ "step": 800
249
+ },
250
+ {
251
+ "epoch": 0.12430080795525171,
252
+ "eval_HasAns_exact": 72.1292331646555,
253
+ "eval_HasAns_f1": 78.83892105701732,
254
+ "eval_HasAns_total": 25690,
255
+ "eval_best_exact": 72.1292331646555,
256
+ "eval_best_exact_thresh": 0.0,
257
+ "eval_best_f1": 78.83892105701732,
258
+ "eval_best_f1_thresh": 0.0,
259
+ "eval_exact": 72.1292331646555,
260
+ "eval_f1": 78.83892105701732,
261
+ "eval_loss": 0.6085864901542664,
262
+ "eval_runtime": 202.3194,
263
+ "eval_samples_per_second": 126.977,
264
+ "eval_steps_per_second": 7.938,
265
+ "eval_total": 25690,
266
+ "step": 800
267
+ },
268
+ {
269
+ "epoch": 0.13206960845245494,
270
+ "grad_norm": 9.786517143249512,
271
+ "learning_rate": 9.986824114356744e-06,
272
+ "loss": 0.7111,
273
+ "step": 850
274
+ },
275
+ {
276
+ "epoch": 0.13983840894965818,
277
+ "grad_norm": 8.391840934753418,
278
+ "learning_rate": 9.986047234307024e-06,
279
+ "loss": 0.669,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 0.13983840894965818,
284
+ "eval_HasAns_exact": 71.89567925262749,
285
+ "eval_HasAns_f1": 78.61371443349637,
286
+ "eval_HasAns_total": 25690,
287
+ "eval_best_exact": 71.89567925262749,
288
+ "eval_best_exact_thresh": 0.0,
289
+ "eval_best_f1": 78.61371443349637,
290
+ "eval_best_f1_thresh": 0.0,
291
+ "eval_exact": 71.89567925262749,
292
+ "eval_f1": 78.61371443349637,
293
+ "eval_loss": 0.5937665104866028,
294
+ "eval_runtime": 202.8476,
295
+ "eval_samples_per_second": 126.647,
296
+ "eval_steps_per_second": 7.917,
297
+ "eval_total": 25690,
298
+ "step": 900
299
+ },
300
+ {
301
+ "epoch": 0.1476072094468614,
302
+ "grad_norm": 11.602773666381836,
303
+ "learning_rate": 9.985270354257304e-06,
304
+ "loss": 0.7253,
305
+ "step": 950
306
+ },
307
+ {
308
+ "epoch": 0.15537600994406464,
309
+ "grad_norm": 9.150772094726562,
310
+ "learning_rate": 9.984493474207582e-06,
311
+ "loss": 0.6676,
312
+ "step": 1000
313
+ },
314
+ {
315
+ "epoch": 0.15537600994406464,
316
+ "eval_HasAns_exact": 72.28493577267419,
317
+ "eval_HasAns_f1": 78.8876551339714,
318
+ "eval_HasAns_total": 25690,
319
+ "eval_best_exact": 72.28493577267419,
320
+ "eval_best_exact_thresh": 0.0,
321
+ "eval_best_f1": 78.8876551339714,
322
+ "eval_best_f1_thresh": 0.0,
323
+ "eval_exact": 72.28493577267419,
324
+ "eval_f1": 78.8876551339714,
325
+ "eval_loss": 0.5816648602485657,
326
+ "eval_runtime": 202.8184,
327
+ "eval_samples_per_second": 126.665,
328
+ "eval_steps_per_second": 7.918,
329
+ "eval_total": 25690,
330
+ "step": 1000
331
+ },
332
+ {
333
+ "epoch": 0.16314481044126786,
334
+ "grad_norm": 10.605375289916992,
335
+ "learning_rate": 9.983716594157864e-06,
336
+ "loss": 0.7131,
337
+ "step": 1050
338
+ },
339
+ {
340
+ "epoch": 0.1709136109384711,
341
+ "grad_norm": 13.075970649719238,
342
+ "learning_rate": 9.982939714108144e-06,
343
+ "loss": 0.6664,
344
+ "step": 1100
345
+ },
346
+ {
347
+ "epoch": 0.1709136109384711,
348
+ "eval_HasAns_exact": 71.95406773063449,
349
+ "eval_HasAns_f1": 78.6814885378308,
350
+ "eval_HasAns_total": 25690,
351
+ "eval_best_exact": 71.95406773063449,
352
+ "eval_best_exact_thresh": 0.0,
353
+ "eval_best_f1": 78.6814885378308,
354
+ "eval_best_f1_thresh": 0.0,
355
+ "eval_exact": 71.95406773063449,
356
+ "eval_f1": 78.6814885378308,
357
+ "eval_loss": 0.5695982575416565,
358
+ "eval_runtime": 202.5931,
359
+ "eval_samples_per_second": 126.806,
360
+ "eval_steps_per_second": 7.927,
361
+ "eval_total": 25690,
362
+ "step": 1100
363
+ },
364
+ {
365
+ "epoch": 0.17868241143567434,
366
+ "grad_norm": 12.333536148071289,
367
+ "learning_rate": 9.982162834058422e-06,
368
+ "loss": 0.6016,
369
+ "step": 1150
370
+ },
371
+ {
372
+ "epoch": 0.18645121193287756,
373
+ "grad_norm": 9.748809814453125,
374
+ "learning_rate": 9.981385954008702e-06,
375
+ "loss": 0.6006,
376
+ "step": 1200
377
+ },
378
+ {
379
+ "epoch": 0.18645121193287756,
380
+ "eval_HasAns_exact": 72.06305955624757,
381
+ "eval_HasAns_f1": 78.75337714295185,
382
+ "eval_HasAns_total": 25690,
383
+ "eval_best_exact": 72.06305955624757,
384
+ "eval_best_exact_thresh": 0.0,
385
+ "eval_best_f1": 78.75337714295185,
386
+ "eval_best_f1_thresh": 0.0,
387
+ "eval_exact": 72.06305955624757,
388
+ "eval_f1": 78.75337714295185,
389
+ "eval_loss": 0.5660755038261414,
390
+ "eval_runtime": 202.8834,
391
+ "eval_samples_per_second": 126.624,
392
+ "eval_steps_per_second": 7.916,
393
+ "eval_total": 25690,
394
+ "step": 1200
395
+ },
396
+ {
397
+ "epoch": 0.1942200124300808,
398
+ "grad_norm": 9.964409828186035,
399
+ "learning_rate": 9.980609073958982e-06,
400
+ "loss": 0.6268,
401
+ "step": 1250
402
+ },
403
+ {
404
+ "epoch": 0.20198881292728402,
405
+ "grad_norm": 6.387030124664307,
406
+ "learning_rate": 9.979832193909262e-06,
407
+ "loss": 0.6111,
408
+ "step": 1300
409
+ },
410
+ {
411
+ "epoch": 0.20198881292728402,
412
+ "eval_HasAns_exact": 72.65862203191904,
413
+ "eval_HasAns_f1": 79.23512118883761,
414
+ "eval_HasAns_total": 25690,
415
+ "eval_best_exact": 72.65862203191904,
416
+ "eval_best_exact_thresh": 0.0,
417
+ "eval_best_f1": 79.23512118883761,
418
+ "eval_best_f1_thresh": 0.0,
419
+ "eval_exact": 72.65862203191904,
420
+ "eval_f1": 79.23512118883761,
421
+ "eval_loss": 0.5586764216423035,
422
+ "eval_runtime": 203.0862,
423
+ "eval_samples_per_second": 126.498,
424
+ "eval_steps_per_second": 7.908,
425
+ "eval_total": 25690,
426
+ "step": 1300
427
+ },
428
+ {
429
+ "epoch": 0.20975761342448726,
430
+ "grad_norm": 9.103731155395508,
431
+ "learning_rate": 9.979055313859542e-06,
432
+ "loss": 0.566,
433
+ "step": 1350
434
+ },
435
+ {
436
+ "epoch": 0.2175264139216905,
437
+ "grad_norm": 13.135197639465332,
438
+ "learning_rate": 9.97827843380982e-06,
439
+ "loss": 0.5793,
440
+ "step": 1400
441
+ },
442
+ {
443
+ "epoch": 0.2175264139216905,
444
+ "eval_HasAns_exact": 72.39392759828728,
445
+ "eval_HasAns_f1": 79.00515970901382,
446
+ "eval_HasAns_total": 25690,
447
+ "eval_best_exact": 72.39392759828728,
448
+ "eval_best_exact_thresh": 0.0,
449
+ "eval_best_f1": 79.00515970901382,
450
+ "eval_best_f1_thresh": 0.0,
451
+ "eval_exact": 72.39392759828728,
452
+ "eval_f1": 79.00515970901382,
453
+ "eval_loss": 0.5600055456161499,
454
+ "eval_runtime": 202.5804,
455
+ "eval_samples_per_second": 126.814,
456
+ "eval_steps_per_second": 7.928,
457
+ "eval_total": 25690,
458
+ "step": 1400
459
+ },
460
+ {
461
+ "epoch": 0.22529521441889372,
462
+ "grad_norm": 9.040102005004883,
463
+ "learning_rate": 9.9775015537601e-06,
464
+ "loss": 0.6309,
465
+ "step": 1450
466
+ },
467
+ {
468
+ "epoch": 0.23306401491609696,
469
+ "grad_norm": 11.526878356933594,
470
+ "learning_rate": 9.97672467371038e-06,
471
+ "loss": 0.6064,
472
+ "step": 1500
473
+ },
474
+ {
475
+ "epoch": 0.23306401491609696,
476
+ "eval_HasAns_exact": 72.74425846632931,
477
+ "eval_HasAns_f1": 79.36155045560372,
478
+ "eval_HasAns_total": 25690,
479
+ "eval_best_exact": 72.74425846632931,
480
+ "eval_best_exact_thresh": 0.0,
481
+ "eval_best_f1": 79.36155045560372,
482
+ "eval_best_f1_thresh": 0.0,
483
+ "eval_exact": 72.74425846632931,
484
+ "eval_f1": 79.36155045560372,
485
+ "eval_loss": 0.5500572323799133,
486
+ "eval_runtime": 202.9048,
487
+ "eval_samples_per_second": 126.611,
488
+ "eval_steps_per_second": 7.915,
489
+ "eval_total": 25690,
490
+ "step": 1500
491
+ },
492
+ {
493
+ "epoch": 0.24083281541330018,
494
+ "grad_norm": 4.359344482421875,
495
+ "learning_rate": 9.97594779366066e-06,
496
+ "loss": 0.6823,
497
+ "step": 1550
498
+ },
499
+ {
500
+ "epoch": 0.24860161591050342,
501
+ "grad_norm": 9.813569068908691,
502
+ "learning_rate": 9.97517091361094e-06,
503
+ "loss": 0.6314,
504
+ "step": 1600
505
+ },
506
+ {
507
+ "epoch": 0.24860161591050342,
508
+ "eval_HasAns_exact": 72.27715064227326,
509
+ "eval_HasAns_f1": 78.8175772466993,
510
+ "eval_HasAns_total": 25690,
511
+ "eval_best_exact": 72.27715064227326,
512
+ "eval_best_exact_thresh": 0.0,
513
+ "eval_best_f1": 78.8175772466993,
514
+ "eval_best_f1_thresh": 0.0,
515
+ "eval_exact": 72.27715064227326,
516
+ "eval_f1": 78.8175772466993,
517
+ "eval_loss": 0.5353918075561523,
518
+ "eval_runtime": 202.6516,
519
+ "eval_samples_per_second": 126.769,
520
+ "eval_steps_per_second": 7.925,
521
+ "eval_total": 25690,
522
+ "step": 1600
523
+ },
524
+ {
525
+ "epoch": 0.25637041640770664,
526
+ "grad_norm": 7.802361965179443,
527
+ "learning_rate": 9.974394033561218e-06,
528
+ "loss": 0.6026,
529
+ "step": 1650
530
+ },
531
+ {
532
+ "epoch": 0.2641392169049099,
533
+ "grad_norm": 15.137731552124023,
534
+ "learning_rate": 9.973617153511498e-06,
535
+ "loss": 0.6741,
536
+ "step": 1700
537
+ },
538
+ {
539
+ "epoch": 0.2641392169049099,
540
+ "eval_HasAns_exact": 72.1058777734527,
541
+ "eval_HasAns_f1": 78.67773271200794,
542
+ "eval_HasAns_total": 25690,
543
+ "eval_best_exact": 72.1058777734527,
544
+ "eval_best_exact_thresh": 0.0,
545
+ "eval_best_f1": 78.67773271200794,
546
+ "eval_best_f1_thresh": 0.0,
547
+ "eval_exact": 72.1058777734527,
548
+ "eval_f1": 78.67773271200794,
549
+ "eval_loss": 0.5329614877700806,
550
+ "eval_runtime": 202.7192,
551
+ "eval_samples_per_second": 126.727,
552
+ "eval_steps_per_second": 7.922,
553
+ "eval_total": 25690,
554
+ "step": 1700
555
+ },
556
+ {
557
+ "epoch": 0.2719080174021131,
558
+ "grad_norm": 11.30902099609375,
559
+ "learning_rate": 9.972840273461778e-06,
560
+ "loss": 0.6069,
561
+ "step": 1750
562
+ },
563
+ {
564
+ "epoch": 0.27967681789931637,
565
+ "grad_norm": 9.311911582946777,
566
+ "learning_rate": 9.972063393412058e-06,
567
+ "loss": 0.5912,
568
+ "step": 1800
569
+ },
570
+ {
571
+ "epoch": 0.27967681789931637,
572
+ "eval_HasAns_exact": 72.24990268586998,
573
+ "eval_HasAns_f1": 78.7891482246884,
574
+ "eval_HasAns_total": 25690,
575
+ "eval_best_exact": 72.24990268586998,
576
+ "eval_best_exact_thresh": 0.0,
577
+ "eval_best_f1": 78.7891482246884,
578
+ "eval_best_f1_thresh": 0.0,
579
+ "eval_exact": 72.24990268586998,
580
+ "eval_f1": 78.7891482246884,
581
+ "eval_loss": 0.5291240811347961,
582
+ "eval_runtime": 202.8091,
583
+ "eval_samples_per_second": 126.671,
584
+ "eval_steps_per_second": 7.919,
585
+ "eval_total": 25690,
586
+ "step": 1800
587
+ },
588
+ {
589
+ "epoch": 0.28744561839651955,
590
+ "grad_norm": 9.725021362304688,
591
+ "learning_rate": 9.971286513362338e-06,
592
+ "loss": 0.5943,
593
+ "step": 1850
594
+ },
595
+ {
596
+ "epoch": 0.2952144188937228,
597
+ "grad_norm": 8.321113586425781,
598
+ "learning_rate": 9.970509633312618e-06,
599
+ "loss": 0.584,
600
+ "step": 1900
601
+ },
602
+ {
603
+ "epoch": 0.2952144188937228,
604
+ "eval_HasAns_exact": 72.56909303230829,
605
+ "eval_HasAns_f1": 79.12955371544794,
606
+ "eval_HasAns_total": 25690,
607
+ "eval_best_exact": 72.56909303230829,
608
+ "eval_best_exact_thresh": 0.0,
609
+ "eval_best_f1": 79.12955371544794,
610
+ "eval_best_f1_thresh": 0.0,
611
+ "eval_exact": 72.56909303230829,
612
+ "eval_f1": 79.12955371544794,
613
+ "eval_loss": 0.5198299288749695,
614
+ "eval_runtime": 202.5651,
615
+ "eval_samples_per_second": 126.823,
616
+ "eval_steps_per_second": 7.928,
617
+ "eval_total": 25690,
618
+ "step": 1900
619
+ },
620
+ {
621
+ "epoch": 0.30298321939092604,
622
+ "grad_norm": 11.973769187927246,
623
+ "learning_rate": 9.969732753262898e-06,
624
+ "loss": 0.5895,
625
+ "step": 1950
626
+ },
627
+ {
628
+ "epoch": 0.3107520198881293,
629
+ "grad_norm": 9.40645980834961,
630
+ "learning_rate": 9.968955873213176e-06,
631
+ "loss": 0.64,
632
+ "step": 2000
633
+ },
634
+ {
635
+ "epoch": 0.3107520198881293,
636
+ "eval_HasAns_exact": 72.79096924873491,
637
+ "eval_HasAns_f1": 79.28733698122942,
638
+ "eval_HasAns_total": 25690,
639
+ "eval_best_exact": 72.79096924873491,
640
+ "eval_best_exact_thresh": 0.0,
641
+ "eval_best_f1": 79.28733698122942,
642
+ "eval_best_f1_thresh": 0.0,
643
+ "eval_exact": 72.79096924873491,
644
+ "eval_f1": 79.28733698122942,
645
+ "eval_loss": 0.5117060542106628,
646
+ "eval_runtime": 203.1079,
647
+ "eval_samples_per_second": 126.485,
648
+ "eval_steps_per_second": 7.907,
649
+ "eval_total": 25690,
650
+ "step": 2000
651
+ }
652
+ ],
653
+ "logging_steps": 50,
654
+ "max_steps": 643600,
655
+ "num_input_tokens_seen": 0,
656
+ "num_train_epochs": 100,
657
+ "save_steps": 500,
658
+ "stateful_callbacks": {
659
+ "EarlyStoppingCallback": {
660
+ "args": {
661
+ "early_stopping_patience": 10,
662
+ "early_stopping_threshold": 0.0
663
+ },
664
+ "attributes": {
665
+ "early_stopping_patience_counter": 5
666
+ }
667
+ },
668
+ "TrainerControl": {
669
+ "args": {
670
+ "should_epoch_stop": false,
671
+ "should_evaluate": false,
672
+ "should_log": false,
673
+ "should_save": true,
674
+ "should_training_stop": false
675
+ },
676
+ "attributes": {}
677
+ }
678
+ },
679
+ "total_flos": 1.6722992431104e+16,
680
+ "train_batch_size": 16,
681
+ "trial_name": null,
682
+ "trial_params": null
683
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6670cd309bc79c4638de77be66649432fb3a049d09959fd300cbc983c7c7160d
3
+ size 5304
checkpoint-2000/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2500/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForQuestionAnswering"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "language": "english",
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "name": "Bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 30522
27
+ }
checkpoint-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34b5ac3b298f089970fd5faa229f4e624022da482e3ed94e91fd321c508c86ed
3
+ size 435596088
checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daf4067887ac289ee1cb25161d18e7a9b07789033905e88e16b492c53d559a46
3
+ size 871311930
checkpoint-2500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac6ce6758a41be9c411ebc19f7bc46befaf0c549f6a057a6116e3dce102cc27
3
+ size 14244
checkpoint-2500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c42f07c52d5e58052c8ab9dd9e6ca80449cf9199ddded4d95cf9bf3e369b2da7
3
+ size 988
checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8007a77b9380d5b0e9793dad6d7cce01a2173bd308ed9eec6014186473ea217a
3
+ size 1064
checkpoint-2500/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
checkpoint-2500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2500/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2500,
3
+ "best_metric": 79.95807844595133,
4
+ "best_model_checkpoint": "bert-soccer-qa/checkpoint-2500",
5
+ "epoch": 0.3884400248601616,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007768800497203232,
14
+ "grad_norm": 9.272425651550293,
15
+ "learning_rate": 9.99925419515227e-06,
16
+ "loss": 1.1446,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.015537600994406464,
21
+ "grad_norm": 12.237289428710938,
22
+ "learning_rate": 9.99847731510255e-06,
23
+ "loss": 0.9455,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.015537600994406464,
28
+ "eval_HasAns_exact": 69.85208252238225,
29
+ "eval_HasAns_f1": 77.13603651720207,
30
+ "eval_HasAns_total": 25690,
31
+ "eval_best_exact": 69.85208252238225,
32
+ "eval_best_exact_thresh": 0.0,
33
+ "eval_best_f1": 77.13603651720207,
34
+ "eval_best_f1_thresh": 0.0,
35
+ "eval_exact": 69.85208252238225,
36
+ "eval_f1": 77.13603651720207,
37
+ "eval_loss": 0.8127343058586121,
38
+ "eval_runtime": 202.0338,
39
+ "eval_samples_per_second": 127.157,
40
+ "eval_steps_per_second": 7.949,
41
+ "eval_total": 25690,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.023306401491609695,
46
+ "grad_norm": 5.626998424530029,
47
+ "learning_rate": 9.997700435052828e-06,
48
+ "loss": 0.8129,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.031075201988812928,
53
+ "grad_norm": 6.87526798248291,
54
+ "learning_rate": 9.996923555003108e-06,
55
+ "loss": 0.8743,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.031075201988812928,
60
+ "eval_HasAns_exact": 70.21409108602569,
61
+ "eval_HasAns_f1": 77.38207884007946,
62
+ "eval_HasAns_total": 25690,
63
+ "eval_best_exact": 70.21409108602569,
64
+ "eval_best_exact_thresh": 0.0,
65
+ "eval_best_f1": 77.38207884007946,
66
+ "eval_best_f1_thresh": 0.0,
67
+ "eval_exact": 70.21409108602569,
68
+ "eval_f1": 77.38207884007946,
69
+ "eval_loss": 0.7383215427398682,
70
+ "eval_runtime": 202.5157,
71
+ "eval_samples_per_second": 126.854,
72
+ "eval_steps_per_second": 7.93,
73
+ "eval_total": 25690,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.03884400248601616,
78
+ "grad_norm": 11.461019515991211,
79
+ "learning_rate": 9.996146674953388e-06,
80
+ "loss": 0.7825,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.04661280298321939,
85
+ "grad_norm": 9.630631446838379,
86
+ "learning_rate": 9.995369794903668e-06,
87
+ "loss": 0.7189,
88
+ "step": 300
89
+ },
90
+ {
91
+ "epoch": 0.04661280298321939,
92
+ "eval_HasAns_exact": 71.03931490852472,
93
+ "eval_HasAns_f1": 78.08897109574,
94
+ "eval_HasAns_total": 25690,
95
+ "eval_best_exact": 71.03931490852472,
96
+ "eval_best_exact_thresh": 0.0,
97
+ "eval_best_f1": 78.08897109574,
98
+ "eval_best_f1_thresh": 0.0,
99
+ "eval_exact": 71.03931490852472,
100
+ "eval_f1": 78.08897109574,
101
+ "eval_loss": 0.7194859981536865,
102
+ "eval_runtime": 202.3679,
103
+ "eval_samples_per_second": 126.947,
104
+ "eval_steps_per_second": 7.936,
105
+ "eval_total": 25690,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.054381603480422626,
110
+ "grad_norm": 11.377346992492676,
111
+ "learning_rate": 9.994592914853948e-06,
112
+ "loss": 0.7889,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.062150403977625855,
117
+ "grad_norm": 7.450821399688721,
118
+ "learning_rate": 9.993816034804228e-06,
119
+ "loss": 0.7367,
120
+ "step": 400
121
+ },
122
+ {
123
+ "epoch": 0.062150403977625855,
124
+ "eval_HasAns_exact": 71.08991825613079,
125
+ "eval_HasAns_f1": 78.00619214091435,
126
+ "eval_HasAns_total": 25690,
127
+ "eval_best_exact": 71.08991825613079,
128
+ "eval_best_exact_thresh": 0.0,
129
+ "eval_best_f1": 78.00619214091435,
130
+ "eval_best_f1_thresh": 0.0,
131
+ "eval_exact": 71.08991825613079,
132
+ "eval_f1": 78.00619214091435,
133
+ "eval_loss": 0.683600902557373,
134
+ "eval_runtime": 202.4728,
135
+ "eval_samples_per_second": 126.881,
136
+ "eval_steps_per_second": 7.932,
137
+ "eval_total": 25690,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.06991920447482909,
142
+ "grad_norm": 14.92029857635498,
143
+ "learning_rate": 9.993039154754508e-06,
144
+ "loss": 0.7838,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.07768800497203232,
149
+ "grad_norm": 9.030844688415527,
150
+ "learning_rate": 9.992262274704786e-06,
151
+ "loss": 0.6469,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 0.07768800497203232,
156
+ "eval_HasAns_exact": 71.42467886337096,
157
+ "eval_HasAns_f1": 78.23334576784337,
158
+ "eval_HasAns_total": 25690,
159
+ "eval_best_exact": 71.42467886337096,
160
+ "eval_best_exact_thresh": 0.0,
161
+ "eval_best_f1": 78.23334576784337,
162
+ "eval_best_f1_thresh": 0.0,
163
+ "eval_exact": 71.42467886337096,
164
+ "eval_f1": 78.23334576784337,
165
+ "eval_loss": 0.6646179556846619,
166
+ "eval_runtime": 202.6333,
167
+ "eval_samples_per_second": 126.781,
168
+ "eval_steps_per_second": 7.926,
169
+ "eval_total": 25690,
170
+ "step": 500
171
+ },
172
+ {
173
+ "epoch": 0.08545680546923555,
174
+ "grad_norm": 10.103166580200195,
175
+ "learning_rate": 9.991485394655066e-06,
176
+ "loss": 0.725,
177
+ "step": 550
178
+ },
179
+ {
180
+ "epoch": 0.09322560596643878,
181
+ "grad_norm": 6.986274242401123,
182
+ "learning_rate": 9.990708514605346e-06,
183
+ "loss": 0.6657,
184
+ "step": 600
185
+ },
186
+ {
187
+ "epoch": 0.09322560596643878,
188
+ "eval_HasAns_exact": 70.2841572596341,
189
+ "eval_HasAns_f1": 77.1247889416155,
190
+ "eval_HasAns_total": 25690,
191
+ "eval_best_exact": 70.2841572596341,
192
+ "eval_best_exact_thresh": 0.0,
193
+ "eval_best_f1": 77.1247889416155,
194
+ "eval_best_f1_thresh": 0.0,
195
+ "eval_exact": 70.2841572596341,
196
+ "eval_f1": 77.1247889416155,
197
+ "eval_loss": 0.6492825150489807,
198
+ "eval_runtime": 203.3854,
199
+ "eval_samples_per_second": 126.312,
200
+ "eval_steps_per_second": 7.896,
201
+ "eval_total": 25690,
202
+ "step": 600
203
+ },
204
+ {
205
+ "epoch": 0.10099440646364201,
206
+ "grad_norm": 10.636602401733398,
207
+ "learning_rate": 9.989931634555626e-06,
208
+ "loss": 0.7337,
209
+ "step": 650
210
+ },
211
+ {
212
+ "epoch": 0.10876320696084525,
213
+ "grad_norm": 8.252824783325195,
214
+ "learning_rate": 9.989154754505906e-06,
215
+ "loss": 0.662,
216
+ "step": 700
217
+ },
218
+ {
219
+ "epoch": 0.10876320696084525,
220
+ "eval_HasAns_exact": 72.22654729466718,
221
+ "eval_HasAns_f1": 79.0143805614001,
222
+ "eval_HasAns_total": 25690,
223
+ "eval_best_exact": 72.22654729466718,
224
+ "eval_best_exact_thresh": 0.0,
225
+ "eval_best_f1": 79.0143805614001,
226
+ "eval_best_f1_thresh": 0.0,
227
+ "eval_exact": 72.22654729466718,
228
+ "eval_f1": 79.0143805614001,
229
+ "eval_loss": 0.6340453028678894,
230
+ "eval_runtime": 202.8216,
231
+ "eval_samples_per_second": 126.663,
232
+ "eval_steps_per_second": 7.918,
233
+ "eval_total": 25690,
234
+ "step": 700
235
+ },
236
+ {
237
+ "epoch": 0.11653200745804848,
238
+ "grad_norm": 8.463785171508789,
239
+ "learning_rate": 9.988377874456184e-06,
240
+ "loss": 0.7265,
241
+ "step": 750
242
+ },
243
+ {
244
+ "epoch": 0.12430080795525171,
245
+ "grad_norm": 9.748174667358398,
246
+ "learning_rate": 9.987600994406464e-06,
247
+ "loss": 0.6969,
248
+ "step": 800
249
+ },
250
+ {
251
+ "epoch": 0.12430080795525171,
252
+ "eval_HasAns_exact": 72.1292331646555,
253
+ "eval_HasAns_f1": 78.83892105701732,
254
+ "eval_HasAns_total": 25690,
255
+ "eval_best_exact": 72.1292331646555,
256
+ "eval_best_exact_thresh": 0.0,
257
+ "eval_best_f1": 78.83892105701732,
258
+ "eval_best_f1_thresh": 0.0,
259
+ "eval_exact": 72.1292331646555,
260
+ "eval_f1": 78.83892105701732,
261
+ "eval_loss": 0.6085864901542664,
262
+ "eval_runtime": 202.3194,
263
+ "eval_samples_per_second": 126.977,
264
+ "eval_steps_per_second": 7.938,
265
+ "eval_total": 25690,
266
+ "step": 800
267
+ },
268
+ {
269
+ "epoch": 0.13206960845245494,
270
+ "grad_norm": 9.786517143249512,
271
+ "learning_rate": 9.986824114356744e-06,
272
+ "loss": 0.7111,
273
+ "step": 850
274
+ },
275
+ {
276
+ "epoch": 0.13983840894965818,
277
+ "grad_norm": 8.391840934753418,
278
+ "learning_rate": 9.986047234307024e-06,
279
+ "loss": 0.669,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 0.13983840894965818,
284
+ "eval_HasAns_exact": 71.89567925262749,
285
+ "eval_HasAns_f1": 78.61371443349637,
286
+ "eval_HasAns_total": 25690,
287
+ "eval_best_exact": 71.89567925262749,
288
+ "eval_best_exact_thresh": 0.0,
289
+ "eval_best_f1": 78.61371443349637,
290
+ "eval_best_f1_thresh": 0.0,
291
+ "eval_exact": 71.89567925262749,
292
+ "eval_f1": 78.61371443349637,
293
+ "eval_loss": 0.5937665104866028,
294
+ "eval_runtime": 202.8476,
295
+ "eval_samples_per_second": 126.647,
296
+ "eval_steps_per_second": 7.917,
297
+ "eval_total": 25690,
298
+ "step": 900
299
+ },
300
+ {
301
+ "epoch": 0.1476072094468614,
302
+ "grad_norm": 11.602773666381836,
303
+ "learning_rate": 9.985270354257304e-06,
304
+ "loss": 0.7253,
305
+ "step": 950
306
+ },
307
+ {
308
+ "epoch": 0.15537600994406464,
309
+ "grad_norm": 9.150772094726562,
310
+ "learning_rate": 9.984493474207582e-06,
311
+ "loss": 0.6676,
312
+ "step": 1000
313
+ },
314
+ {
315
+ "epoch": 0.15537600994406464,
316
+ "eval_HasAns_exact": 72.28493577267419,
317
+ "eval_HasAns_f1": 78.8876551339714,
318
+ "eval_HasAns_total": 25690,
319
+ "eval_best_exact": 72.28493577267419,
320
+ "eval_best_exact_thresh": 0.0,
321
+ "eval_best_f1": 78.8876551339714,
322
+ "eval_best_f1_thresh": 0.0,
323
+ "eval_exact": 72.28493577267419,
324
+ "eval_f1": 78.8876551339714,
325
+ "eval_loss": 0.5816648602485657,
326
+ "eval_runtime": 202.8184,
327
+ "eval_samples_per_second": 126.665,
328
+ "eval_steps_per_second": 7.918,
329
+ "eval_total": 25690,
330
+ "step": 1000
331
+ },
332
+ {
333
+ "epoch": 0.16314481044126786,
334
+ "grad_norm": 10.605375289916992,
335
+ "learning_rate": 9.983716594157864e-06,
336
+ "loss": 0.7131,
337
+ "step": 1050
338
+ },
339
+ {
340
+ "epoch": 0.1709136109384711,
341
+ "grad_norm": 13.075970649719238,
342
+ "learning_rate": 9.982939714108144e-06,
343
+ "loss": 0.6664,
344
+ "step": 1100
345
+ },
346
+ {
347
+ "epoch": 0.1709136109384711,
348
+ "eval_HasAns_exact": 71.95406773063449,
349
+ "eval_HasAns_f1": 78.6814885378308,
350
+ "eval_HasAns_total": 25690,
351
+ "eval_best_exact": 71.95406773063449,
352
+ "eval_best_exact_thresh": 0.0,
353
+ "eval_best_f1": 78.6814885378308,
354
+ "eval_best_f1_thresh": 0.0,
355
+ "eval_exact": 71.95406773063449,
356
+ "eval_f1": 78.6814885378308,
357
+ "eval_loss": 0.5695982575416565,
358
+ "eval_runtime": 202.5931,
359
+ "eval_samples_per_second": 126.806,
360
+ "eval_steps_per_second": 7.927,
361
+ "eval_total": 25690,
362
+ "step": 1100
363
+ },
364
+ {
365
+ "epoch": 0.17868241143567434,
366
+ "grad_norm": 12.333536148071289,
367
+ "learning_rate": 9.982162834058422e-06,
368
+ "loss": 0.6016,
369
+ "step": 1150
370
+ },
371
+ {
372
+ "epoch": 0.18645121193287756,
373
+ "grad_norm": 9.748809814453125,
374
+ "learning_rate": 9.981385954008702e-06,
375
+ "loss": 0.6006,
376
+ "step": 1200
377
+ },
378
+ {
379
+ "epoch": 0.18645121193287756,
380
+ "eval_HasAns_exact": 72.06305955624757,
381
+ "eval_HasAns_f1": 78.75337714295185,
382
+ "eval_HasAns_total": 25690,
383
+ "eval_best_exact": 72.06305955624757,
384
+ "eval_best_exact_thresh": 0.0,
385
+ "eval_best_f1": 78.75337714295185,
386
+ "eval_best_f1_thresh": 0.0,
387
+ "eval_exact": 72.06305955624757,
388
+ "eval_f1": 78.75337714295185,
389
+ "eval_loss": 0.5660755038261414,
390
+ "eval_runtime": 202.8834,
391
+ "eval_samples_per_second": 126.624,
392
+ "eval_steps_per_second": 7.916,
393
+ "eval_total": 25690,
394
+ "step": 1200
395
+ },
396
+ {
397
+ "epoch": 0.1942200124300808,
398
+ "grad_norm": 9.964409828186035,
399
+ "learning_rate": 9.980609073958982e-06,
400
+ "loss": 0.6268,
401
+ "step": 1250
402
+ },
403
+ {
404
+ "epoch": 0.20198881292728402,
405
+ "grad_norm": 6.387030124664307,
406
+ "learning_rate": 9.979832193909262e-06,
407
+ "loss": 0.6111,
408
+ "step": 1300
409
+ },
410
+ {
411
+ "epoch": 0.20198881292728402,
412
+ "eval_HasAns_exact": 72.65862203191904,
413
+ "eval_HasAns_f1": 79.23512118883761,
414
+ "eval_HasAns_total": 25690,
415
+ "eval_best_exact": 72.65862203191904,
416
+ "eval_best_exact_thresh": 0.0,
417
+ "eval_best_f1": 79.23512118883761,
418
+ "eval_best_f1_thresh": 0.0,
419
+ "eval_exact": 72.65862203191904,
420
+ "eval_f1": 79.23512118883761,
421
+ "eval_loss": 0.5586764216423035,
422
+ "eval_runtime": 203.0862,
423
+ "eval_samples_per_second": 126.498,
424
+ "eval_steps_per_second": 7.908,
425
+ "eval_total": 25690,
426
+ "step": 1300
427
+ },
428
+ {
429
+ "epoch": 0.20975761342448726,
430
+ "grad_norm": 9.103731155395508,
431
+ "learning_rate": 9.979055313859542e-06,
432
+ "loss": 0.566,
433
+ "step": 1350
434
+ },
435
+ {
436
+ "epoch": 0.2175264139216905,
437
+ "grad_norm": 13.135197639465332,
438
+ "learning_rate": 9.97827843380982e-06,
439
+ "loss": 0.5793,
440
+ "step": 1400
441
+ },
442
+ {
443
+ "epoch": 0.2175264139216905,
444
+ "eval_HasAns_exact": 72.39392759828728,
445
+ "eval_HasAns_f1": 79.00515970901382,
446
+ "eval_HasAns_total": 25690,
447
+ "eval_best_exact": 72.39392759828728,
448
+ "eval_best_exact_thresh": 0.0,
449
+ "eval_best_f1": 79.00515970901382,
450
+ "eval_best_f1_thresh": 0.0,
451
+ "eval_exact": 72.39392759828728,
452
+ "eval_f1": 79.00515970901382,
453
+ "eval_loss": 0.5600055456161499,
454
+ "eval_runtime": 202.5804,
455
+ "eval_samples_per_second": 126.814,
456
+ "eval_steps_per_second": 7.928,
457
+ "eval_total": 25690,
458
+ "step": 1400
459
+ },
460
+ {
461
+ "epoch": 0.22529521441889372,
462
+ "grad_norm": 9.040102005004883,
463
+ "learning_rate": 9.9775015537601e-06,
464
+ "loss": 0.6309,
465
+ "step": 1450
466
+ },
467
+ {
468
+ "epoch": 0.23306401491609696,
469
+ "grad_norm": 11.526878356933594,
470
+ "learning_rate": 9.97672467371038e-06,
471
+ "loss": 0.6064,
472
+ "step": 1500
473
+ },
474
+ {
475
+ "epoch": 0.23306401491609696,
476
+ "eval_HasAns_exact": 72.74425846632931,
477
+ "eval_HasAns_f1": 79.36155045560372,
478
+ "eval_HasAns_total": 25690,
479
+ "eval_best_exact": 72.74425846632931,
480
+ "eval_best_exact_thresh": 0.0,
481
+ "eval_best_f1": 79.36155045560372,
482
+ "eval_best_f1_thresh": 0.0,
483
+ "eval_exact": 72.74425846632931,
484
+ "eval_f1": 79.36155045560372,
485
+ "eval_loss": 0.5500572323799133,
486
+ "eval_runtime": 202.9048,
487
+ "eval_samples_per_second": 126.611,
488
+ "eval_steps_per_second": 7.915,
489
+ "eval_total": 25690,
490
+ "step": 1500
491
+ },
492
+ {
493
+ "epoch": 0.24083281541330018,
494
+ "grad_norm": 4.359344482421875,
495
+ "learning_rate": 9.97594779366066e-06,
496
+ "loss": 0.6823,
497
+ "step": 1550
498
+ },
499
+ {
500
+ "epoch": 0.24860161591050342,
501
+ "grad_norm": 9.813569068908691,
502
+ "learning_rate": 9.97517091361094e-06,
503
+ "loss": 0.6314,
504
+ "step": 1600
505
+ },
506
+ {
507
+ "epoch": 0.24860161591050342,
508
+ "eval_HasAns_exact": 72.27715064227326,
509
+ "eval_HasAns_f1": 78.8175772466993,
510
+ "eval_HasAns_total": 25690,
511
+ "eval_best_exact": 72.27715064227326,
512
+ "eval_best_exact_thresh": 0.0,
513
+ "eval_best_f1": 78.8175772466993,
514
+ "eval_best_f1_thresh": 0.0,
515
+ "eval_exact": 72.27715064227326,
516
+ "eval_f1": 78.8175772466993,
517
+ "eval_loss": 0.5353918075561523,
518
+ "eval_runtime": 202.6516,
519
+ "eval_samples_per_second": 126.769,
520
+ "eval_steps_per_second": 7.925,
521
+ "eval_total": 25690,
522
+ "step": 1600
523
+ },
524
+ {
525
+ "epoch": 0.25637041640770664,
526
+ "grad_norm": 7.802361965179443,
527
+ "learning_rate": 9.974394033561218e-06,
528
+ "loss": 0.6026,
529
+ "step": 1650
530
+ },
531
+ {
532
+ "epoch": 0.2641392169049099,
533
+ "grad_norm": 15.137731552124023,
534
+ "learning_rate": 9.973617153511498e-06,
535
+ "loss": 0.6741,
536
+ "step": 1700
537
+ },
538
+ {
539
+ "epoch": 0.2641392169049099,
540
+ "eval_HasAns_exact": 72.1058777734527,
541
+ "eval_HasAns_f1": 78.67773271200794,
542
+ "eval_HasAns_total": 25690,
543
+ "eval_best_exact": 72.1058777734527,
544
+ "eval_best_exact_thresh": 0.0,
545
+ "eval_best_f1": 78.67773271200794,
546
+ "eval_best_f1_thresh": 0.0,
547
+ "eval_exact": 72.1058777734527,
548
+ "eval_f1": 78.67773271200794,
549
+ "eval_loss": 0.5329614877700806,
550
+ "eval_runtime": 202.7192,
551
+ "eval_samples_per_second": 126.727,
552
+ "eval_steps_per_second": 7.922,
553
+ "eval_total": 25690,
554
+ "step": 1700
555
+ },
556
+ {
557
+ "epoch": 0.2719080174021131,
558
+ "grad_norm": 11.30902099609375,
559
+ "learning_rate": 9.972840273461778e-06,
560
+ "loss": 0.6069,
561
+ "step": 1750
562
+ },
563
+ {
564
+ "epoch": 0.27967681789931637,
565
+ "grad_norm": 9.311911582946777,
566
+ "learning_rate": 9.972063393412058e-06,
567
+ "loss": 0.5912,
568
+ "step": 1800
569
+ },
570
+ {
571
+ "epoch": 0.27967681789931637,
572
+ "eval_HasAns_exact": 72.24990268586998,
573
+ "eval_HasAns_f1": 78.7891482246884,
574
+ "eval_HasAns_total": 25690,
575
+ "eval_best_exact": 72.24990268586998,
576
+ "eval_best_exact_thresh": 0.0,
577
+ "eval_best_f1": 78.7891482246884,
578
+ "eval_best_f1_thresh": 0.0,
579
+ "eval_exact": 72.24990268586998,
580
+ "eval_f1": 78.7891482246884,
581
+ "eval_loss": 0.5291240811347961,
582
+ "eval_runtime": 202.8091,
583
+ "eval_samples_per_second": 126.671,
584
+ "eval_steps_per_second": 7.919,
585
+ "eval_total": 25690,
586
+ "step": 1800
587
+ },
588
+ {
589
+ "epoch": 0.28744561839651955,
590
+ "grad_norm": 9.725021362304688,
591
+ "learning_rate": 9.971286513362338e-06,
592
+ "loss": 0.5943,
593
+ "step": 1850
594
+ },
595
+ {
596
+ "epoch": 0.2952144188937228,
597
+ "grad_norm": 8.321113586425781,
598
+ "learning_rate": 9.970509633312618e-06,
599
+ "loss": 0.584,
600
+ "step": 1900
601
+ },
602
+ {
603
+ "epoch": 0.2952144188937228,
604
+ "eval_HasAns_exact": 72.56909303230829,
605
+ "eval_HasAns_f1": 79.12955371544794,
606
+ "eval_HasAns_total": 25690,
607
+ "eval_best_exact": 72.56909303230829,
608
+ "eval_best_exact_thresh": 0.0,
609
+ "eval_best_f1": 79.12955371544794,
610
+ "eval_best_f1_thresh": 0.0,
611
+ "eval_exact": 72.56909303230829,
612
+ "eval_f1": 79.12955371544794,
613
+ "eval_loss": 0.5198299288749695,
614
+ "eval_runtime": 202.5651,
615
+ "eval_samples_per_second": 126.823,
616
+ "eval_steps_per_second": 7.928,
617
+ "eval_total": 25690,
618
+ "step": 1900
619
+ },
620
+ {
621
+ "epoch": 0.30298321939092604,
622
+ "grad_norm": 11.973769187927246,
623
+ "learning_rate": 9.969732753262898e-06,
624
+ "loss": 0.5895,
625
+ "step": 1950
626
+ },
627
+ {
628
+ "epoch": 0.3107520198881293,
629
+ "grad_norm": 9.40645980834961,
630
+ "learning_rate": 9.968955873213176e-06,
631
+ "loss": 0.64,
632
+ "step": 2000
633
+ },
634
+ {
635
+ "epoch": 0.3107520198881293,
636
+ "eval_HasAns_exact": 72.79096924873491,
637
+ "eval_HasAns_f1": 79.28733698122942,
638
+ "eval_HasAns_total": 25690,
639
+ "eval_best_exact": 72.79096924873491,
640
+ "eval_best_exact_thresh": 0.0,
641
+ "eval_best_f1": 79.28733698122942,
642
+ "eval_best_f1_thresh": 0.0,
643
+ "eval_exact": 72.79096924873491,
644
+ "eval_f1": 79.28733698122942,
645
+ "eval_loss": 0.5117060542106628,
646
+ "eval_runtime": 203.1079,
647
+ "eval_samples_per_second": 126.485,
648
+ "eval_steps_per_second": 7.907,
649
+ "eval_total": 25690,
650
+ "step": 2000
651
+ },
652
+ {
653
+ "epoch": 0.3185208203853325,
654
+ "grad_norm": 14.14991569519043,
655
+ "learning_rate": 9.968178993163456e-06,
656
+ "loss": 0.5559,
657
+ "step": 2050
658
+ },
659
+ {
660
+ "epoch": 0.3262896208825357,
661
+ "grad_norm": 10.641693115234375,
662
+ "learning_rate": 9.967402113113736e-06,
663
+ "loss": 0.5361,
664
+ "step": 2100
665
+ },
666
+ {
667
+ "epoch": 0.3262896208825357,
668
+ "eval_HasAns_exact": 73.13351498637603,
669
+ "eval_HasAns_f1": 79.60268173326928,
670
+ "eval_HasAns_total": 25690,
671
+ "eval_best_exact": 73.13351498637603,
672
+ "eval_best_exact_thresh": 0.0,
673
+ "eval_best_f1": 79.60268173326928,
674
+ "eval_best_f1_thresh": 0.0,
675
+ "eval_exact": 73.13351498637603,
676
+ "eval_f1": 79.60268173326928,
677
+ "eval_loss": 0.5079160928726196,
678
+ "eval_runtime": 202.7995,
679
+ "eval_samples_per_second": 126.677,
680
+ "eval_steps_per_second": 7.919,
681
+ "eval_total": 25690,
682
+ "step": 2100
683
+ },
684
+ {
685
+ "epoch": 0.33405842137973896,
686
+ "grad_norm": 10.088223457336426,
687
+ "learning_rate": 9.966625233064016e-06,
688
+ "loss": 0.6151,
689
+ "step": 2150
690
+ },
691
+ {
692
+ "epoch": 0.3418272218769422,
693
+ "grad_norm": 10.863611221313477,
694
+ "learning_rate": 9.965848353014296e-06,
695
+ "loss": 0.5935,
696
+ "step": 2200
697
+ },
698
+ {
699
+ "epoch": 0.3418272218769422,
700
+ "eval_HasAns_exact": 72.9349941611522,
701
+ "eval_HasAns_f1": 79.49240914807372,
702
+ "eval_HasAns_total": 25690,
703
+ "eval_best_exact": 72.9349941611522,
704
+ "eval_best_exact_thresh": 0.0,
705
+ "eval_best_f1": 79.49240914807372,
706
+ "eval_best_f1_thresh": 0.0,
707
+ "eval_exact": 72.9349941611522,
708
+ "eval_f1": 79.49240914807372,
709
+ "eval_loss": 0.5025383234024048,
710
+ "eval_runtime": 203.3956,
711
+ "eval_samples_per_second": 126.306,
712
+ "eval_steps_per_second": 7.896,
713
+ "eval_total": 25690,
714
+ "step": 2200
715
+ },
716
+ {
717
+ "epoch": 0.34959602237414544,
718
+ "grad_norm": 12.565064430236816,
719
+ "learning_rate": 9.965071472964574e-06,
720
+ "loss": 0.5667,
721
+ "step": 2250
722
+ },
723
+ {
724
+ "epoch": 0.3573648228713487,
725
+ "grad_norm": 5.066905975341797,
726
+ "learning_rate": 9.964294592914854e-06,
727
+ "loss": 0.5198,
728
+ "step": 2300
729
+ },
730
+ {
731
+ "epoch": 0.3573648228713487,
732
+ "eval_HasAns_exact": 72.6975476839237,
733
+ "eval_HasAns_f1": 79.25327470362464,
734
+ "eval_HasAns_total": 25690,
735
+ "eval_best_exact": 72.6975476839237,
736
+ "eval_best_exact_thresh": 0.0,
737
+ "eval_best_f1": 79.25327470362464,
738
+ "eval_best_f1_thresh": 0.0,
739
+ "eval_exact": 72.6975476839237,
740
+ "eval_f1": 79.25327470362464,
741
+ "eval_loss": 0.4996239244937897,
742
+ "eval_runtime": 204.0149,
743
+ "eval_samples_per_second": 125.922,
744
+ "eval_steps_per_second": 7.872,
745
+ "eval_total": 25690,
746
+ "step": 2300
747
+ },
748
+ {
749
+ "epoch": 0.3651336233685519,
750
+ "grad_norm": 10.777655601501465,
751
+ "learning_rate": 9.963517712865134e-06,
752
+ "loss": 0.4983,
753
+ "step": 2350
754
+ },
755
+ {
756
+ "epoch": 0.3729024238657551,
757
+ "grad_norm": 6.770049095153809,
758
+ "learning_rate": 9.962740832815414e-06,
759
+ "loss": 0.5474,
760
+ "step": 2400
761
+ },
762
+ {
763
+ "epoch": 0.3729024238657551,
764
+ "eval_HasAns_exact": 73.29700272479565,
765
+ "eval_HasAns_f1": 79.7561576366954,
766
+ "eval_HasAns_total": 25690,
767
+ "eval_best_exact": 73.29700272479565,
768
+ "eval_best_exact_thresh": 0.0,
769
+ "eval_best_f1": 79.7561576366954,
770
+ "eval_best_f1_thresh": 0.0,
771
+ "eval_exact": 73.29700272479565,
772
+ "eval_f1": 79.7561576366954,
773
+ "eval_loss": 0.4912045896053314,
774
+ "eval_runtime": 202.5387,
775
+ "eval_samples_per_second": 126.84,
776
+ "eval_steps_per_second": 7.929,
777
+ "eval_total": 25690,
778
+ "step": 2400
779
+ },
780
+ {
781
+ "epoch": 0.38067122436295836,
782
+ "grad_norm": 11.886155128479004,
783
+ "learning_rate": 9.961963952765694e-06,
784
+ "loss": 0.5038,
785
+ "step": 2450
786
+ },
787
+ {
788
+ "epoch": 0.3884400248601616,
789
+ "grad_norm": 15.60519027709961,
790
+ "learning_rate": 9.961187072715973e-06,
791
+ "loss": 0.5655,
792
+ "step": 2500
793
+ },
794
+ {
795
+ "epoch": 0.3884400248601616,
796
+ "eval_HasAns_exact": 73.46049046321527,
797
+ "eval_HasAns_f1": 79.95807844595133,
798
+ "eval_HasAns_total": 25690,
799
+ "eval_best_exact": 73.46049046321527,
800
+ "eval_best_exact_thresh": 0.0,
801
+ "eval_best_f1": 79.95807844595133,
802
+ "eval_best_f1_thresh": 0.0,
803
+ "eval_exact": 73.46049046321527,
804
+ "eval_f1": 79.95807844595133,
805
+ "eval_loss": 0.48474493622779846,
806
+ "eval_runtime": 202.6752,
807
+ "eval_samples_per_second": 126.755,
808
+ "eval_steps_per_second": 7.924,
809
+ "eval_total": 25690,
810
+ "step": 2500
811
+ }
812
+ ],
813
+ "logging_steps": 50,
814
+ "max_steps": 643600,
815
+ "num_input_tokens_seen": 0,
816
+ "num_train_epochs": 100,
817
+ "save_steps": 500,
818
+ "stateful_callbacks": {
819
+ "EarlyStoppingCallback": {
820
+ "args": {
821
+ "early_stopping_patience": 10,
822
+ "early_stopping_threshold": 0.0
823
+ },
824
+ "attributes": {
825
+ "early_stopping_patience_counter": 0
826
+ }
827
+ },
828
+ "TrainerControl": {
829
+ "args": {
830
+ "should_epoch_stop": false,
831
+ "should_evaluate": false,
832
+ "should_log": false,
833
+ "should_save": true,
834
+ "should_training_stop": false
835
+ },
836
+ "attributes": {}
837
+ }
838
+ },
839
+ "total_flos": 2.090374053888e+16,
840
+ "train_batch_size": 16,
841
+ "trial_name": null,
842
+ "trial_params": null
843
+ }
checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6670cd309bc79c4638de77be66649432fb3a049d09959fd300cbc983c7c7160d
3
+ size 5304
checkpoint-2500/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3000/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForQuestionAnswering"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "language": "english",
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "name": "Bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.50.3",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 30522
27
+ }
checkpoint-3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d9f01f2fbc8837fbfff5ff3be61c2957266444df3282cd389ac793a2ad3306
3
+ size 435596088