Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| # Copyright 2017-present, Facebook, Inc. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """Regex based tokenizer that emulates the Stanford/NLTK PTB tokenizers. | |
| However it is purely in Python, supports robust untokenization, unicode, | |
| and requires minimal dependencies. | |
| """ | |
| import regex | |
| import logging | |
| from .tokenizer import Tokens, Tokenizer | |
| logger = logging.getLogger(__name__) | |
class RegexpTokenizer(Tokenizer):
    """Regex-based tokenizer emulating the Stanford/NLTK PTB tokenizers.

    A single compiled alternation of the patterns below is scanned over the
    input text. Each alternative lives in its own named group, so
    ``tokenize`` can tell which kind of token matched and, optionally,
    normalize it. Alternatives are tried in the order they appear in the
    compiled pattern, so more specific patterns precede ``PUNCT``/``NON_WS``.
    """

    # Runs of decimal digits, optionally joined by ':', '.' or ','.
    DIGIT = r'\p{Nd}+([:\.\,]\p{Nd}+)*'
    # Honorific/title abbreviation followed by a period and a separator.
    TITLE = (r'(dr|esq|hon|jr|mr|mrs|ms|prof|rev|sr|st|rt|messrs|mmes|msgr)'
             r'\.(?=\p{Z})')
    # Two or more letter+period pairs, followed by a separator or line end.
    ABBRV = r'([\p{L}]\.){2,}(?=\p{Z}|$)'
    # Possessive (non-backtracking) run of letters, numbers and marks.
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]++'
    # Alphanumeric runs joined by one or more hyphen characters.
    HYPHEN = r'{A}([-\u058A\u2010\u2011]{A})+'.format(A=ALPHA_NUM)
    # Either the stem that precedes "n't", or "n't" itself.
    NEGATION = r"((?!n't)[\p{L}\p{N}\p{M}])++(?=n't)|n't"
    # The "can" of "cannot".
    CONTRACTION1 = r"can(?=not\b)"
    # Apostrophe clitics: 't, 's, 'd, 'm, 're, 'll, 've.
    CONTRACTION2 = r"'([tsdm]|re|ll|ve)\b"
    # Opening double quote: preceded by separator/open bracket/line start,
    # and not followed by a separator.
    START_DQUOTE = r'(?<=[\p{Z}\(\[{<]|^)(``|["\u0093\u201C\u00AB])(?!\p{Z})'
    # Opening single quote, same context rules as START_DQUOTE.
    START_SQUOTE = r'(?<=[\p{Z}\(\[{<]|^)[\'\u0091\u2018\u201B\u2039](?!\p{Z})'
    # Closing double quote: not preceded by a separator.
    END_DQUOTE = r'(?<!\p{Z})(\'\'|["\u0094\u201D\u00BB])'
    # Closing single quote: not preceded by a separator.
    END_SQUOTE = r'(?<!\p{Z})[\'\u0092\u2019\u203A]'
    # Double hyphen or a dash character.
    DASH = r'--|[\u0096\u0097\u2013\u2014\u2015]'
    # Three periods or the single ellipsis character.
    ELLIPSES = r'\.\.\.|\u2026'
    # Any single punctuation character.
    PUNCT = r'\p{P}'
    # Fallback: any single character that is neither a separator (\p{Z})
    # nor in the "other" category (\p{C}).
    NON_WS = r'[^\p{Z}\p{C}]'

    # (group name, normalized token) pairs applied when substitutions are on.
    _NORMALIZED = (
        ('sdquote', "``"),
        ('edquote', "''"),
        ('ssquote', "`"),
        ('esquote', "'"),
        ('dash', '--'),
        ('ellipses', '...'),
    )

    def __init__(self, **kwargs):
        """Compile the token pattern and read the options.

        Args:
            annotators: None or empty set (only tokenizes). Any non-empty
                value is ignored with a warning.
            substitutions: if true, normalizes some token types (e.g. quotes).
                Defaults to True.
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE
        )
        # Truthiness check instead of len(): `annotators=None` is a
        # documented value and must not raise TypeError.
        annotators = kwargs.get('annotators')
        if annotators:
            logger.warning('%s only tokenizes! Skipping annotators: %s',
                           type(self).__name__, annotators)
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)

    def tokenize(self, text):
        """Split ``text`` into tokens.

        Args:
            text: the string to tokenize.

        Returns:
            ``Tokens(data, self.annotators)`` where ``data`` holds one
            ``(token, text_ws, span)`` tuple per match: ``token`` is the
            (possibly normalized) token string, ``text_ws`` is the original
            text from the token's start up to the start of the next token
            (so it keeps whatever followed the token), and ``span`` is the
            match's ``(start, end)`` offsets in ``text``.
        """
        matches = list(self._regexp.finditer(text))
        data = []
        for i, match in enumerate(matches):
            token = match.group()

            # Make normalizations for special token types
            if self.substitutions:
                groups = match.groupdict()
                for name, normalized in self._NORMALIZED:
                    if groups[name]:
                        token = normalized
                        break

            # The original-text slice runs to the next token's start; the
            # last token has nothing after it, so it ends at its own end.
            span = match.span()
            if i + 1 < len(matches):
                end_ws = matches[i + 1].span()[0]
            else:
                end_ws = span[1]

            data.append((token, text[span[0]:end_ws], span))
        return Tokens(data, self.annotators)