|
|
import re |
|
|
import numpy as np |
|
|
import sys |
|
|
|
|
|
|
|
|
# Raise NumPy's summarisation threshold to the largest possible value so that
# arrays are always printed in full rather than abbreviated with "...".
# NOTE: this is a process-wide setting applied as a side effect of importing
# this module.
np.set_printoptions(threshold=sys.maxsize)
|
|
|
|
|
|
|
|
class Labeler:
    """Convert between raw text and per-character separator labels.

    Labelling removes every "separator" character (by default a horizontal
    whitespace character or a zero-width non-joiner, U+200C) from the text and
    records, on the character that immediately precedes it, the tag of the
    separator class that followed. Label ``0`` means "nothing follows".
    ``text_generator`` performs the inverse operation.

    Attributes:
        class_count: Number of separator classes in use; only the first
            ``class_count`` entries of ``tags``/``regexes``/``chars`` are read.
        data: The input most recently passed to ``label_text``.
        labels: Initialised to an empty list; not written by this class.
        corpus_type: The corpus type most recently passed to ``label_text``.
    """

    WHOLE_RAW = 'whole_raw'
    SENTS_RAW = 'sents_raw'

    def __init__(self, tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"),
                 class_count=2):
        """Configure the separator classes.

        Args:
            tags: Label value assigned for each separator class.
            regexes: Pattern locating the separator of each class. Only the
                start index of every match is used.
            chars: String re-inserted for each class when generating text.
                The second default is written as an explicit escape so the
                invisible zero-width non-joiner cannot be lost in editing.
            class_count: How many leading entries of the three sequences
                above are active.
        """
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count

        self.data = None
        self.labels = []
        self.corpus_type = None

    def _sent_labeler(self, sent: str):
        """Label a single string and return its characters and labels.

        Args:
            sent: The string to be labeled.

        Returns:
            A tuple ``(characters, labels)`` of two equally long lists:
            the characters of ``sent`` with all separators removed, and the
            label of each remaining character.
        """
        labels = [0] * len(sent)
        # A set, so a position matched by more than one class regex is
        # removed exactly once.
        removed = set()

        for i in range(self.class_count):
            for match in re.finditer(self._regexes[i], sent):
                idx = match.start()
                # A separator at position 0 has no preceding character.
                # Indexing labels[idx - 1] there would wrap around to
                # labels[-1] and wrongly tag the last character.
                if idx > 0:
                    labels[idx - 1] = self._tags[i]
                removed.add(idx)

        # Single filtering pass instead of repeated O(n) pops.
        characters = [ch for pos, ch in enumerate(sent) if pos not in removed]
        kept_labels = [tag for pos, tag in enumerate(labels)
                       if pos not in removed]
        return characters, kept_labels

    def _text_labeler(self):
        """Label the whole text held in ``self.data``.

        Returns:
            A tuple ``(characters, labels)`` as produced by
            ``_sent_labeler``.
        """
        return self._sent_labeler(self.data)

    def _labeler(self):
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple ``(characters, labels)`` where each element is a list of
            lists: one inner list per sentence for ``SENTS_RAW``, or a single
            inner list for ``WHOLE_RAW``. Both are empty lists for any other
            corpus type.
        """
        result_chars = []
        result_labels = []

        if self.corpus_type == self.SENTS_RAW:
            for sent in self.data:
                characters, labels = self._sent_labeler(sent)
                result_chars.append(characters)
                result_labels.append(labels)
        elif self.corpus_type == self.WHOLE_RAW:
            characters, labels = self._text_labeler()
            # Wrapped so both corpus types share the list-of-lists shape.
            result_chars = [characters]
            result_labels = [labels]

        return result_chars, result_labels

    def label_text(self, textinput, corpus_type):
        """Label text and return characters and labels.

        The input and corpus type are stored on the instance (``self.data``
        and ``self.corpus_type``) before labelling.

        Args:
            textinput: A string (for ``WHOLE_RAW``) or a list of strings
                (for ``SENTS_RAW``).
            corpus_type: Either ``Labeler.WHOLE_RAW`` or
                ``Labeler.SENTS_RAW``.

        Returns:
            A tuple ``(characters, labels)``, each a list of lists (see
            ``_labeler``).

        Raises:
            ValueError: If ``corpus_type`` is unknown or does not match the
                type of ``textinput``.
        """
        whole_ok = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        sents_ok = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (whole_ok or sents_ok):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")

        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars, labels):
        """Rebuild one string from characters and their labels.

        Args:
            chars: A list of characters.
            labels: A list of labels for those characters. Pairs are consumed
                with ``zip``, so extra items in the longer list are ignored.

        Returns:
            A string in which every character whose label equals one of the
            active tags is followed by that class's character. Labels equal
            to ``0`` or matching no active tag insert nothing.
        """
        result = []
        for char, label in zip(chars, labels):
            result.append(char)
            if label == 0:
                continue
            # Compared with ``==`` (not a dict lookup) so label objects that
            # are comparable but not hashable by value keep working.
            for i in range(self.class_count):
                if label == self._tags[i]:
                    result.append(self._class_chars[i])
                    break
        return ''.join(result)

    def text_generator(self, chars, labels, corpus_type):
        """Generate text with the labelled separators re-inserted.

        Args:
            chars: For ``SENTS_RAW`` a list of lists of characters; for
                ``WHOLE_RAW`` a flat list of characters (not the wrapped
                list-of-lists that ``label_text`` returns).
            labels: Labels in the same layout as ``chars``.
            corpus_type: Either ``Labeler.WHOLE_RAW`` or
                ``Labeler.SENTS_RAW``.

        Returns:
            A list of strings for ``SENTS_RAW``, or a single string for
            ``WHOLE_RAW``.

        Raises:
            ValueError: If ``corpus_type`` is neither of the two supported
                values.
        """
        if corpus_type == self.SENTS_RAW:
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]
        if corpus_type == self.WHOLE_RAW:
            return self._text_generator(chars, labels)
        raise ValueError(f"Invalid corpus type: {corpus_type!r}")