import re
import sys
from typing import Any, List, Sequence, Tuple, Union

import numpy as np

# Set the print option for numpy arrays to display the whole array without truncation
np.set_printoptions(threshold=sys.maxsize)


class Labeler:
    """Turn separator characters in raw text into per-character labels, and back.

    Each class is described by a tag, a regular expression and an output
    character. Labelling removes every character matched by a class regex and
    stores that class's tag on the character immediately before it; all other
    characters get label ``0``. Generation does the reverse: it re-inserts the
    class character after every character carrying that class's tag.

    With the defaults, class 1 is any whitespace character other than
    ``\\r``, ``\\n``, ``\\v`` and ``\\f`` (regenerated as a plain space) and
    class 2 is U+200C ZERO WIDTH NON-JOINER.
    """

    # Define corpus types as class constants
    WHOLE_RAW = 'whole_raw'  # data is one string
    SENTS_RAW = 'sents_raw'  # data is a list of strings

    def __init__(self, tags=(1, 2), regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"), class_count=2):
        """Configure the label classes.

        Args:
            tags: Label value for each class. ``0`` is used for "no label",
                so it should not be used as a tag.
            regexes: For each class, the pattern whose matches are removed
                from the text and recorded as labels.
            chars: For each class, the character inserted back into the text
                by the generator methods.
            class_count: How many leading entries of ``tags``, ``regexes``
                and ``chars`` are used.
        """
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count
        self.data = None
        self.labels = []
        self.corpus_type = None

    def _label_string(self, text: str) -> Tuple[List[str], List[Any]]:
        """Strip class matches from ``text`` and label the preceding characters.

        Args:
            text: The string to label.

        Returns:
            A tuple ``(characters, labels)`` of equal length: the characters
            that were kept, and for each of them either ``0`` or the tag of
            the class whose match directly followed it in ``text``.
        """
        labels = [0] * len(text)
        # A set, so an index matched by more than one class is removed once.
        removed = set()

        for i in range(self.class_count):
            for match in re.finditer(self._regexes[i], text):
                idx = match.start()
                removed.add(idx)
                # A match at position 0 has no preceding character; without
                # this guard ``labels[-1]`` (the last character) would be
                # labelled instead.
                if idx > 0:
                    labels[idx - 1] = self._tags[i]

        # Single filtering pass instead of repeated list deletions.
        characters = [ch for pos, ch in enumerate(text) if pos not in removed]
        kept_labels = [lab for pos, lab in enumerate(labels) if pos not in removed]
        return characters, kept_labels

    def _sent_labeler(self, sent: str) -> Tuple[List[str], List[Any]]:
        """Label a single sentence and return characters and labels.

        Args:
            sent: The sentence to be labeled

        Returns:
            A tuple of (characters, labels)
        """
        return self._label_string(sent)

    def _text_labeler(self) -> Tuple[List[str], List[Any]]:
        """Label the whole text held in ``self.data`` and return characters and labels."""
        return self._label_string(self.data)

    def _labeler(self) -> Tuple[List[List[str]], List[List[Any]]]:
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple ``(characters, labels)`` where each element is a list with
            one inner list per sentence (``SENTS_RAW``) or a single inner list
            for the whole text (``WHOLE_RAW``). Both are empty if the corpus
            type is neither of the two.
        """
        result_chars = []
        result_labels = []

        if self.corpus_type == self.SENTS_RAW:
            for sent in self.data:
                # Label each sentence individually
                characters, labels = self._sent_labeler(sent)
                result_chars.append(characters)
                result_labels.append(labels)
        elif self.corpus_type == self.WHOLE_RAW:
            characters, labels = self._text_labeler()
            # Wrap the results in lists to maintain consistent return structure
            result_chars = [characters]
            result_labels = [labels]

        return result_chars, result_labels

    def label_text(self, textinput: Union[str, List[str]],
                   corpus_type: str) -> Tuple[List[List[str]], List[List[Any]]]:
        """Label text and return characters and labels.

        The input and corpus type are also stored on the instance as
        ``self.data`` and ``self.corpus_type``.

        Args:
            textinput: A string (for ``WHOLE_RAW``) or a list of strings
                (for ``SENTS_RAW``).
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            A tuple of (characters, labels), each a list of lists. For
            ``WHOLE_RAW`` the outer lists contain exactly one element.

        Raises:
            ValueError: If ``corpus_type`` is unknown or does not fit the
                type of ``textinput``.
        """
        whole_ok = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        sents_ok = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (whole_ok or sents_ok):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")

        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars: Sequence[str], labels: Sequence[Any]) -> str:
        """Generate text with class characters inserted according to labels.

        Args:
            chars: A flat sequence of characters
            labels: A sequence of labels, one per character

        Returns:
            A string in which every character whose label equals a class tag
            is followed by that class's character. Labels equal to ``0`` or
            matching no tag insert nothing.
        """
        result = []
        for char, label in zip(chars, labels):
            # Always add the character
            result.append(char)
            if label != 0:
                # Equality scan (not a dict lookup) so label objects that are
                # comparable but not hashable-by-value keep working.
                for i in range(self.class_count):
                    if label == self._tags[i]:
                        result.append(self._class_chars[i])
                        break
        return ''.join(result)

    def text_generator(self, chars, labels, corpus_type: str) -> Union[str, List[str]]:
        """Generate text with labels inserted.

        Note that for ``WHOLE_RAW`` this expects flat sequences, whereas
        ``label_text`` returns the whole-text result wrapped in a one-element
        list; pass ``chars[0]`` / ``labels[0]`` of that result here.

        Args:
            chars: A list of characters (``WHOLE_RAW``) or a list of lists of
                characters (``SENTS_RAW``)
            labels: A list of labels or a list of lists of labels, matching
                the structure of ``chars``
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            A string (``WHOLE_RAW``) or a list of strings (``SENTS_RAW``) with
            class characters inserted.

        Raises:
            ValueError: If ``corpus_type`` is not one of the supported types.
        """
        if corpus_type == self.SENTS_RAW:
            # Process each sentence separately
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]
        if corpus_type == self.WHOLE_RAW:
            # Process the whole text at once
            return self._text_generator(chars, labels)
        # Fail loudly, as label_text does, instead of returning None.
        raise ValueError(f"Invalid corpus type: {corpus_type!r}")