| | import collections |
| | import re |
| | from enum import Enum |
| |
|
| | import six |
| |
|
| | _DEF_PUNCS = ';:,.!?¡¿—…"«»“”' |
| |
|
| | _PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"]) |
| |
|
| |
|
| | class PuncPosition(Enum): |
| | """Enum for the punctuations positions""" |
| |
|
| | BEGIN = 0 |
| | END = 1 |
| | MIDDLE = 2 |
| | ALONE = 3 |
| |
|
| |
|
| | class Punctuation: |
| | """Handle punctuations in text. |
| | |
| | Just strip punctuations from text or strip and restore them later. |
| | |
| | Args: |
| | puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`. |
| | |
| | Example: |
| | >>> punc = Punctuation() |
| | >>> punc.strip("This is. example !") |
| | 'This is example' |
| | |
| | >>> text_striped, punc_map = punc.strip_to_restore("This is. example !") |
| | >>> ' '.join(text_striped) |
| | 'This is example' |
| | |
| | >>> text_restored = punc.restore(text_striped, punc_map) |
| | >>> text_restored[0] |
| | 'This is. example !' |
| | """ |
| |
|
| | def __init__(self, puncs: str = _DEF_PUNCS): |
| | self.puncs = puncs |
| |
|
| | @staticmethod |
| | def default_puncs(): |
| | """Return default set of punctuations.""" |
| | return _DEF_PUNCS |
| |
|
| | @property |
| | def puncs(self): |
| | return self._puncs |
| |
|
| | @puncs.setter |
| | def puncs(self, value): |
| | if not isinstance(value, six.string_types): |
| | raise ValueError("[!] Punctuations must be of type str.") |
| | self._puncs = "".join(list(dict.fromkeys(list(value)))) |
| | self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+") |
| |
|
| | def strip(self, text): |
| | """Remove all the punctuations by replacing with `space`. |
| | |
| | Args: |
| | text (str): The text to be processed. |
| | |
| | Example:: |
| | |
| | "This is. example !" -> "This is example " |
| | """ |
| | return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip() |
| |
|
| | def strip_to_restore(self, text): |
| | """Remove punctuations from text to restore them later. |
| | |
| | Args: |
| | text (str): The text to be processed. |
| | |
| | Examples :: |
| | |
| | "This is. example !" -> [["This is", "example"], [".", "!"]] |
| | |
| | """ |
| | text, puncs = self._strip_to_restore(text) |
| | return text, puncs |
| |
|
| | def _strip_to_restore(self, text): |
| | """Auxiliary method for Punctuation.preserve()""" |
| | matches = list(re.finditer(self.puncs_regular_exp, text)) |
| | if not matches: |
| | return [text], [] |
| | |
| | if len(matches) == 1 and matches[0].group() == text: |
| | return [], [_PUNC_IDX(text, PuncPosition.ALONE)] |
| | |
| | puncs = [] |
| | for match in matches: |
| | position = PuncPosition.MIDDLE |
| | if match == matches[0] and text.startswith(match.group()): |
| | position = PuncPosition.BEGIN |
| | elif match == matches[-1] and text.endswith(match.group()): |
| | position = PuncPosition.END |
| | puncs.append(_PUNC_IDX(match.group(), position)) |
| | |
| | splitted_text = [] |
| | for idx, punc in enumerate(puncs): |
| | split = text.split(punc.punc) |
| | prefix, suffix = split[0], punc.punc.join(split[1:]) |
| | splitted_text.append(prefix) |
| | |
| | if idx == len(puncs) - 1 and len(suffix) > 0: |
| | splitted_text.append(suffix) |
| | text = suffix |
| | while splitted_text[0] == '': |
| | splitted_text = splitted_text[1:] |
| | return splitted_text, puncs |
| |
|
| | @classmethod |
| | def restore(cls, text, puncs): |
| | """Restore punctuation in a text. |
| | |
| | Args: |
| | text (str): The text to be processed. |
| | puncs (List[str]): The list of punctuations map to be used for restoring. |
| | |
| | Examples :: |
| | |
| | ['This is', 'example'], ['.', '!'] -> "This is. example!" |
| | |
| | """ |
| | return cls._restore(text, puncs, 0) |
| |
|
| | @classmethod |
| | def _restore(cls, text, puncs, num): |
| | """Auxiliary method for Punctuation.restore()""" |
| | if not puncs: |
| | return text |
| |
|
| | |
| | if not text: |
| | return ["".join(m.punc for m in puncs)] |
| |
|
| | current = puncs[0] |
| |
|
| | if current.position == PuncPosition.BEGIN: |
| | return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num) |
| |
|
| | if current.position == PuncPosition.END: |
| | return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1) |
| |
|
| | if current.position == PuncPosition.ALONE: |
| | return [current.mark] + cls._restore(text, puncs[1:], num + 1) |
| |
|
| | |
| | if len(text) == 1: |
| | |
| | |
| | return cls._restore([text[0] + current.punc], puncs[1:], num) |
| |
|
| | return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |