Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| # Copyright 2017-present, Facebook, Inc. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """Basic tokenizer that splits text into alpha-numeric tokens and | |
| non-whitespace tokens. | |
| """ | |
| import regex | |
| import logging | |
| from .tokenizer import Tokens, Tokenizer | |
| logger = logging.getLogger(__name__) | |
class SimpleTokenizer(Tokenizer):
    """Regex tokenizer that emits alpha-numeric runs and single symbols.

    A token is either a maximal run of Unicode letters, numbers and
    combining marks (``ALPHA_NUM``), or a single character that is neither
    a Unicode separator nor a control/"other" character (``NON_WS``).
    No annotation (POS, lemma, NER, ...) is performed.
    """

    # One or more letters (\p{L}), numbers (\p{N}) or marks (\p{M}).
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    # Any single character that is not a separator (\p{Z}) or other (\p{C}).
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes). A non-empty
                collection is ignored with a warning.
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE
        )
        # Truthiness test instead of len(): an explicit ``annotators=None``
        # is a documented value and must not raise TypeError.
        annotators = kwargs.get('annotators')
        if annotators:
            logger.warning('%s only tokenizes! Skipping annotators: %s',
                           type(self).__name__, annotators)
        self.annotators = set()

    def tokenize(self, text):
        """Split ``text`` into tokens.

        Args:
            text: the string to tokenize.

        Returns:
            A ``Tokens`` object built from one tuple per token:
            ``(token, token_with_trailing_text, (start, end))`` where the
            second element is the slice of ``text`` from the token's start
            up to the start of the next token (so it carries any trailing
            whitespace), and the third is the token's character span.
        """
        matches = list(self._regexp.finditer(text))
        last = len(matches) - 1
        data = []
        for i, match in enumerate(matches):
            span = match.span()
            # The trailing text runs up to where the next token begins;
            # the final token has no follower, so it stops at its own end.
            end_ws = matches[i + 1].start() if i < last else span[1]
            data.append((
                match.group(),
                text[span[0]: end_ws],
                span,
            ))
        return Tokens(data, self.annotators)