| import spacy | |
| from spacy.tokenizer import Tokenizer | |
def create_custom_tokenizer():
    """Build a factory for a spaCy ``Tokenizer`` with extra split rules.

    Returns a callable that takes a loaded ``nlp`` pipeline and produces a
    tokenizer that additionally splits on ``/``, ``-``, ``,`` and ``:`` as
    infixes, and on a leading ``-`` as a prefix, while keeping spaCy's
    default prefix/infix/suffix behaviour.
    """

    def create_tokenizer(nlp):
        # Defaults.infixes / Defaults.prefixes are tuples in spaCy, so a
        # bare `tuple + list` concatenation raises TypeError; convert to
        # list first.
        infixes = list(nlp.Defaults.infixes) + [
            r"/",
            r"-",
            r",",
            r":",
        ]
        prefixes = list(nlp.Defaults.prefixes) + [
            r"-",
        ]
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        # Compile the default suffixes as well: the original construction
        # omitted suffix_search, which dropped spaCy's standard suffix
        # splitting (e.g. trailing punctuation stayed attached to tokens).
        suffix_regex = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
        # NOTE(review): tokenizer exceptions (`rules=`) and `token_match`
        # are still not forwarded here — confirm whether the default
        # exception handling should be restored too.
        return Tokenizer(
            nlp.vocab,
            prefix_search=prefix_regex.search,
            suffix_search=suffix_regex.search,
            infix_finditer=infix_regex.finditer,
        )

    return create_tokenizer