# File size: 6,375 Bytes

# 07b65ad

import re
import numpy as np
import sys

# Module-level side effect: raise NumPy's summarization threshold to sys.maxsize so
# that arrays are always printed in full instead of being abbreviated with "...".
# NOTE: this changes NumPy's print options for everything that imports this module.
np.set_printoptions(threshold=sys.maxsize)


class Labeler:
    """Convert raw text to per-character labels and back.

    Every character matched by one of the configured regexes (by default a
    horizontal whitespace character or a zero-width non-joiner, U+200C) is
    removed from the text, and the character immediately before it receives
    the tag of the matching class. All other characters keep the label 0.
    ``text_generator`` performs the reverse operation by re-inserting the
    class character after every character carrying a non-zero label.

    Attributes:
        class_count: Number of classes; only the first ``class_count`` entries
            of ``tags``, ``regexes`` and ``chars`` are used.
        data: The input most recently passed to ``label_text``.
        corpus_type: The corpus type most recently passed to ``label_text``.
        labels: Initialised to an empty list; not modified by this class.
    """

    # Define corpus types as class constants
    WHOLE_RAW = 'whole_raw'
    SENTS_RAW = 'sents_raw'

    def __init__(self, tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"),
                 class_count=2):
        """Configure the label classes.

        Args:
            tags: Label value assigned for each class (index-aligned with
                ``regexes`` and ``chars``).
            regexes: Pattern per class; the start of every match is the
                position that is removed from the text.
            chars: Character per class that ``text_generator`` re-inserts.
                The second default is U+200C, written as an escape because
                the literal character is invisible.
            class_count: How many classes to use.
        """
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count

        self.data = None
        self.labels = []
        self.corpus_type = None

    def _label_sequence(self, text):
        """Label one string and return its kept characters and their labels.

        Args:
            text: The string to label.

        Returns:
            A tuple ``(characters, labels)`` of equally long lists, with every
            matched position removed from both.
        """
        labels = [0] * len(text)
        # A set keeps removal O(1) per lookup and makes a position matched by
        # more than one class count only once.
        removed = set()

        for i in range(self.class_count):
            for match in re.finditer(self._regexes[i], text):
                idx = match.start()
                # A match at position 0 has no preceding character; without
                # this guard ``labels[-1]`` (the LAST character) would be
                # tagged by mistake.
                if idx > 0:
                    labels[idx - 1] = self._tags[i]
                removed.add(idx)

        # Rebuild both sequences in one pass instead of popping index by index.
        kept_chars = [ch for pos, ch in enumerate(text) if pos not in removed]
        kept_labels = [lab for pos, lab in enumerate(labels) if pos not in removed]
        return kept_chars, kept_labels

    def _sent_labeler(self, sent: str):
        """Label a single sentence and return characters and labels.

        Args:
            sent: The sentence to be labeled

        Returns:
            A tuple of (characters, labels)
        """
        return self._label_sequence(sent)

    def _text_labeler(self):
        """Label the whole text stored in ``self.data``.

        Returns:
            A tuple of (characters, labels)
        """
        return self._label_sequence(self.data)

    def _labeler(self):
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple ``(chars, labels)`` where each element is a list of lists:
            one inner list per sentence for ``SENTS_RAW``, or a single inner
            list for ``WHOLE_RAW``. Two empty lists are returned when the
            corpus type is neither of these.
        """
        if self.corpus_type == self.SENTS_RAW:
            # Label each sentence individually
            result_chars = []
            result_labels = []
            for sent in self.data:
                characters, labels = self._sent_labeler(sent)
                result_chars.append(characters)
                result_labels.append(labels)
            return result_chars, result_labels

        if self.corpus_type == self.WHOLE_RAW:
            # Wrap the results in lists to maintain consistent return structure
            characters, labels = self._text_labeler()
            return [characters], [labels]

        return [], []

    def label_text(self, textinput, corpus_type):
        """Label text and return characters and labels.

        The input and corpus type are stored on the instance (``self.data``,
        ``self.corpus_type``) before labeling.

        Args:
            textinput: A string for ``WHOLE_RAW`` or a list of strings for
                ``SENTS_RAW``
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            A tuple of (characters, labels), each a list of lists (a single
            inner list for ``WHOLE_RAW``).

        Raises:
            ValueError: If ``corpus_type`` is unknown or ``textinput`` has the
                wrong type for it.
        """
        is_whole = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        is_sents = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (is_whole or is_sents):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")

        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars, labels):
        """Generate text with labels inserted.

        Args:
            chars: A list of characters
            labels: A list of labels for those characters

        Returns:
            A string with class characters inserted according to the labels
        """
        result = []
        for char, label in zip(chars, labels):
            # Always add the character
            result.append(char)

            # Add the class character of the first class whose tag equals the
            # label. Tags are compared with ``==`` (not looked up by hash) so
            # that label objects which compare equal to a tag keep working.
            if label != 0:
                for i in range(self.class_count):
                    if label == self._tags[i]:
                        result.append(self._class_chars[i])
                        break

        return ''.join(result)

    def text_generator(self, chars, labels, corpus_type):
        """Generate text with labels inserted.

        Args:
            chars: A list of lists of characters for ``SENTS_RAW``. For
                ``WHOLE_RAW`` either a flat list of characters or the
                single-element wrapped form returned by ``label_text``.
            labels: Labels in the same layout as ``chars``
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            A list of strings for ``SENTS_RAW``, a single string for
            ``WHOLE_RAW``, or ``None`` for any other corpus type.
        """
        if corpus_type == self.SENTS_RAW:
            # Process each sentence separately
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]

        if corpus_type == self.WHOLE_RAW:
            # ``label_text`` wraps WHOLE_RAW results in a one-element list;
            # unwrap that form so its output can be passed here unchanged.
            if len(chars) == 1 and isinstance(chars[0], (list, tuple)):
                chars, labels = chars[0], labels[0]
            return self._text_generator(chars, labels)

        return None