Add custom processor
Browse files
Add a custom processor to parse HTML.
- tokenizer_config.json +3 -0
tokenizer_config.json
CHANGED
|
@@ -68,12 +68,14 @@
|
|
| 68 |
"errors": "replace",
|
| 69 |
"mask_token": "<mask>",
|
| 70 |
"max_depth": 50,
|
|
|
|
| 71 |
"max_width": 1000,
|
| 72 |
"model_max_length": 512,
|
| 73 |
"only_label_first_subword": true,
|
| 74 |
"pad_token": "<pad>",
|
| 75 |
"pad_token_label": -100,
|
| 76 |
"pad_width": 1001,
|
|
|
|
| 77 |
"processor_class": "MarkupLMPhishProcessor",
|
| 78 |
"sep_token": "</s>",
|
| 79 |
"tags_dict": {
|
|
@@ -295,5 +297,6 @@
|
|
| 295 |
},
|
| 296 |
"tokenizer_class": "MarkupLMTokenizer",
|
| 297 |
"trim_offsets": false,
|
|
|
|
| 298 |
"unk_token": "<unk>"
|
| 299 |
}
|
|
|
|
| 68 |
"errors": "replace",
|
| 69 |
"mask_token": "<mask>",
|
| 70 |
"max_depth": 50,
|
| 71 |
+
"max_length": 512,
|
| 72 |
"max_width": 1000,
|
| 73 |
"model_max_length": 512,
|
| 74 |
"only_label_first_subword": true,
|
| 75 |
"pad_token": "<pad>",
|
| 76 |
"pad_token_label": -100,
|
| 77 |
"pad_width": 1001,
|
| 78 |
+
"padding": "max_length",
|
| 79 |
"processor_class": "MarkupLMPhishProcessor",
|
| 80 |
"sep_token": "</s>",
|
| 81 |
"tags_dict": {
|
|
|
|
| 297 |
},
|
| 298 |
"tokenizer_class": "MarkupLMTokenizer",
|
| 299 |
"trim_offsets": false,
|
| 300 |
+
"truncation": true,
|
| 301 |
"unk_token": "<unk>"
|
| 302 |
}
|