| import re |
| import numpy as np |
| import sys |
|
|
| |
| # Module-level side effect: raise NumPy's summarization threshold to sys.maxsize so arrays are printed in full instead of being abbreviated with "...". |
| np.set_printoptions(threshold=sys.maxsize) |
|
|
|
|
class Labeler:
    """Convert text to and from a (characters, labels) representation.

    Every class is described by three parallel entries: a tag, a regex and a
    class character. Labeling removes each character at which a class regex
    matches and stores that class's tag on the character immediately before
    it; every other character keeps label 0. Generation reverses the process
    by re-inserting the class character after each tagged character.
    """

    WHOLE_RAW = 'whole_raw'
    SENTS_RAW = 'sents_raw'

    def __init__(self, tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"),
                 class_count=2):
        """Configure the classes used for labeling and generation.

        Args:
            tags: Label value stored for each class (0 means "no class").
            regexes: One pattern per class; the character at the start of
                each match is removed from the text. The defaults match any
                whitespace except CR, LF, VT and FF, and U+200C respectively.
            chars: Character re-inserted for each class by the generators.
                The second default is written as an escape so the invisible
                U+200C character cannot be lost when the file is edited.
            class_count: Number of leading entries of ``tags``, ``regexes``
                and ``chars`` that are actually used.
        """
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count

        # State filled in by label_text() and read by the private labelers.
        self.data = None
        self.labels = []
        self.corpus_type = None

    def _label_string(self, text: str) -> tuple:
        """Label one string; shared by the sentence and whole-text labelers.

        Args:
            text: The string to be labeled.

        Returns:
            A tuple of (characters, labels) with matched characters removed.
        """
        labels = [0] * len(text)
        # A set, so a position matched by several regexes is removed once.
        removed = set()

        for i in range(self.class_count):
            tag = self._tags[i]
            for match in re.finditer(self._regexes[i], text):
                idx = match.start()
                # A match at position 0 has no preceding character; without
                # this guard index -1 would tag the *last* character instead.
                if idx > 0:
                    labels[idx - 1] = tag
                removed.add(idx)

        # Single filtering pass instead of repeated pops from the lists.
        characters = [ch for pos, ch in enumerate(text) if pos not in removed]
        kept_labels = [lab for pos, lab in enumerate(labels)
                       if pos not in removed]
        return characters, kept_labels

    def _sent_labeler(self, sent: str):
        """Label a single sentence and return characters and labels.

        Args:
            sent: The sentence to be labeled

        Returns:
            A tuple of (characters, labels)
        """
        return self._label_string(sent)

    def _text_labeler(self):
        """Label the whole text held in ``self.data``.

        Returns:
            A tuple of (characters, labels)
        """
        return self._label_string(self.data)

    def _labeler(self):
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple of (characters, labels). Both are lists holding one inner
            list per sentence; a whole-text corpus yields a single inner
            list. Both are empty if the corpus type is not recognised.
        """
        if self.corpus_type == self.SENTS_RAW:
            labeled = [self._sent_labeler(sent) for sent in self.data]
            return ([chars for chars, _ in labeled],
                    [labels for _, labels in labeled])

        if self.corpus_type == self.WHOLE_RAW:
            characters, labels = self._text_labeler()
            # Wrapped so both corpus types share the list-of-lists shape.
            return [characters], [labels]

        return [], []

    def label_text(self, textinput, corpus_type):
        """Label text and return characters and labels.

        The input and corpus type are also stored on the instance as
        ``data`` and ``corpus_type``.

        Args:
            textinput: A string for Labeler.WHOLE_RAW, or a list of strings
                for Labeler.SENTS_RAW
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            A tuple of (characters, labels), each a list with one inner list
            per sentence (a single inner list for Labeler.WHOLE_RAW)

        Raises:
            ValueError: If ``corpus_type`` is unknown or does not match the
                type of ``textinput``.
        """
        is_whole = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        is_sents = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (is_whole or is_sents):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")

        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars, labels) -> str:
        """Generate text with labels inserted.

        Args:
            chars: A list of characters
            labels: A list of labels for those characters

        Returns:
            A string with class characters inserted according to the labels
        """
        pieces = []
        for char, label in zip(chars, labels):
            pieces.append(char)
            if label != 0:
                # The first class whose tag equals the label wins; a label
                # matching no configured tag inserts nothing.
                for i in range(self.class_count):
                    if label == self._tags[i]:
                        pieces.append(self._class_chars[i])
                        break
        return ''.join(pieces)

    def text_generator(self, chars, labels, corpus_type):
        """Generate text with labels inserted.

        Note that label_text() wraps its Labeler.WHOLE_RAW result in a
        one-element list, so pass ``chars[0]`` and ``labels[0]`` of that
        result here when regenerating a whole text.

        Args:
            chars: A list of characters (Labeler.WHOLE_RAW) or a list of
                lists of characters (Labeler.SENTS_RAW)
            labels: A list of labels or a list of lists of labels, matching
                the shape of ``chars``
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            Either a string or a list of strings with class characters
            inserted

        Raises:
            ValueError: If ``corpus_type`` is not one of the two supported
                values (previously this silently returned None).
        """
        if corpus_type == self.SENTS_RAW:
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]
        if corpus_type == self.WHOLE_RAW:
            return self._text_generator(chars, labels)
        raise ValueError(f"Invalid corpus_type: {corpus_type!r}")