Added missing normalizer to tokenizer
Browse files
- tokenizer.json +7 -0
tokenizer.json
CHANGED
|
@@ -76,6 +76,13 @@
|
|
| 76 |
"String": "<foreign>"
|
| 77 |
},
|
| 78 |
"content": "UNK"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
}
|
| 80 |
]
|
| 81 |
},
|
|
|
|
| 76 |
"String": "<foreign>"
|
| 77 |
},
|
| 78 |
"content": "UNK"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"type": "Replace",
|
| 82 |
+
"pattern": {
|
| 83 |
+
"Regex": "[^-\u0000--‿₠-₿∀-⋿⅐-↋ff-ﭏ]+"
|
| 84 |
+
},
|
| 85 |
+
"content": "UNK"
|
| 86 |
}
|
| 87 |
]
|
| 88 |
},
|