File size: 6,375 Bytes
07b65ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import re
import numpy as np
import sys

# Module-level side effect: on import, raise NumPy's print threshold to
# sys.maxsize so arrays are printed in full instead of being summarized.
np.set_printoptions(threshold=sys.maxsize)


class Labeler:
    """Convert text to and from a (characters, labels) representation.

    Every character matched by one of the class regexes (by default an
    ordinary space and the zero-width non-joiner U+200C) is removed from the
    text, and the character immediately before it receives that class's tag.
    Characters not followed by a class character keep the label 0.
    ``text_generator`` performs the inverse operation by re-inserting the
    class characters after the tagged characters.

    Args:
        tags: Label value assigned for each class, by class index.
        regexes: Pattern identifying the separator character of each class.
        chars: Character re-inserted for each class when generating text.
        class_count: Number of classes taken from the sequences above.
    """

    # Define corpus types as class constants
    WHOLE_RAW = 'whole_raw'
    SENTS_RAW = 'sents_raw'

    def __init__(self, tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"),
                 class_count=2):
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count

        # Set by label_text(); read by _labeler() / _text_labeler().
        self.data = None
        self.labels = []
        self.corpus_type = None

    def _label_sequence(self, text):
        """Strip class characters from ``text`` and label what remains.

        Args:
            text: The string to be labeled

        Returns:
            A tuple of (characters, labels) of equal length
        """
        labels = [0] * len(text)
        # A set, so that a position matched by more than one class regex is
        # removed exactly once.
        deletable = set()

        for i in range(self.class_count):
            for match in re.finditer(self._regexes[i], text):
                idx = match.start()
                # A match at position 0 has no preceding character; without
                # this guard index -1 would wrongly tag the LAST character.
                if idx > 0:
                    labels[idx - 1] = self._tags[i]
                deletable.add(idx)

        # Build the kept characters/labels in one pass instead of popping
        # from the middle of the lists once per match.
        characters = [ch for pos, ch in enumerate(text)
                      if pos not in deletable]
        kept_labels = [lab for pos, lab in enumerate(labels)
                       if pos not in deletable]
        return characters, kept_labels

    def _sent_labeler(self, sent: str):
        """Label a single sentence and return characters and labels.

        Args:
            sent: The sentence to be labeled

        Returns:
            A tuple of (characters, labels)
        """
        return self._label_sequence(sent)

    def _text_labeler(self):
        """Label the whole text in ``self.data`` and return characters and labels."""
        return self._label_sequence(self.data)

    def _labeler(self):
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple of (characters, labels), each a list with one inner list
            per sentence (SENTS_RAW) or a single inner list (WHOLE_RAW).
            Both are empty when the corpus type is neither of the two.
        """
        result_chars = []
        result_labels = []

        if self.corpus_type == self.SENTS_RAW:
            for sent in self.data:
                characters, labels = self._sent_labeler(sent)
                result_chars.append(characters)
                result_labels.append(labels)
        elif self.corpus_type == self.WHOLE_RAW:
            characters, labels = self._text_labeler()
            # Wrap the results in lists to maintain consistent return structure
            result_chars = [characters]
            result_labels = [labels]

        return result_chars, result_labels

    def label_text(self, textinput, corpus_type):
        """Label text and return characters and labels.

        Args:
            textinput: Either a string or a list of strings to label
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            A tuple of (characters, labels); each is a list of lists, with a
            single inner list for Labeler.WHOLE_RAW

        Raises:
            ValueError: If corpus_type is unknown or does not match the type
                of textinput (str for WHOLE_RAW, list for SENTS_RAW)
        """
        # Validate input types
        whole_ok = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        sents_ok = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (whole_ok or sents_ok):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")

        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars, labels):
        """Generate text with labels inserted.

        Args:
            chars: A list of characters
            labels: A list of labels for those characters

        Returns:
            A string with class characters inserted according to the labels
        """
        result = []
        for char, label in zip(chars, labels):
            # Always add the character
            result.append(char)

            # Add the class character of the first class whose tag equals the
            # label; labels matching no tag insert nothing.
            if label != 0:
                for i in range(self.class_count):
                    if label == self._tags[i]:
                        result.append(self._class_chars[i])
                        break

        return ''.join(result)

    def text_generator(self, chars, labels, corpus_type):
        """Generate text with labels inserted.

        Args:
            chars: Either a list of characters or a list of lists of characters
            labels: Either a list of labels or a list of lists of labels
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW

        Returns:
            Either a string or a list of strings with class characters inserted

        Raises:
            ValueError: If corpus_type is neither WHOLE_RAW nor SENTS_RAW
        """
        if corpus_type == self.SENTS_RAW:
            # Process each sentence separately
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]

        if corpus_type == self.WHOLE_RAW:
            # label_text() wraps its WHOLE_RAW result in a one-element list;
            # accept that shape too so its output can be passed back directly.
            if (isinstance(chars, (list, tuple)) and len(chars) == 1
                    and isinstance(chars[0], (list, tuple))):
                chars, labels = chars[0], labels[0]
            return self._text_generator(chars, labels)

        raise ValueError(f"Unknown corpus type: {corpus_type!r}")