# File size: 6,375 bytes (revision 07b65ad)
import re
import numpy as np
import sys
# Print NumPy arrays in full: raising the summarization threshold to
# sys.maxsize keeps large arrays from being abbreviated with "...".
# NOTE: this runs at import time and changes NumPy's print options globally.
np.set_printoptions(threshold=sys.maxsize)
class Labeler:
    """Convert delimiter characters in text into per-character labels and back.

    Every class pairs a regular expression, a tag and a "class character".
    Labeling removes each character at which a class regex matches and stores
    that class's tag on the character immediately before it; all other
    characters keep the label 0. Generation reverses the process by
    re-inserting the class character after every tagged character.
    """

    # Define corpus types as class constants
    WHOLE_RAW = 'whole_raw'
    SENTS_RAW = 'sents_raw'

    def __init__(self, tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"),
                 class_count=2):
        """Configure the classes used for labeling and generation.

        Args:
            tags: Label value stored for each class (0 is reserved for
                "no class").
            regexes: One pattern per class. The defaults match whitespace
                other than ``\\r``, ``\\n``, ``\\v`` and ``\\f`` (class 1)
                and U+200C (class 2).
            chars: Character re-inserted for each class by the generators.
                The second default is written as an explicit escape because
                a literal U+200C is invisible in source code.
            class_count: Number of leading entries of ``tags``, ``regexes``
                and ``chars`` that are actually used.
        """
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count
        self.data = None
        self.labels = []
        self.corpus_type = None

    def _label_chars(self, text: str):
        """Strip class characters from ``text`` and label what remains.

        Args:
            text: The string to be labeled.

        Returns:
            A tuple ``(characters, labels)`` of equal-length lists.
        """
        labels = [0] * len(text)
        # A set, not a list: two classes matching at the same position must
        # remove that position only once.
        removed = set()
        for i in range(self.class_count):
            tag = self._tags[i]
            for match in re.finditer(self._regexes[i], text):
                idx = match.start()
                # A match at position 0 has no preceding character; without
                # this guard index -1 would tag the *last* character instead.
                if idx > 0:
                    labels[idx - 1] = tag
                removed.add(idx)
        # Filtering in one pass avoids index shifting and repeated O(n) pops,
        # and silently ignores a zero-width match at len(text).
        characters = [ch for pos, ch in enumerate(text) if pos not in removed]
        kept_labels = [lab for pos, lab in enumerate(labels)
                       if pos not in removed]
        return characters, kept_labels

    def _sent_labeler(self, sent: str):
        """Label a single sentence and return characters and labels.

        Args:
            sent: The sentence to be labeled.

        Returns:
            A tuple of (characters, labels).
        """
        return self._label_chars(sent)

    def _text_labeler(self):
        """Label the whole text held in ``self.data``.

        Returns:
            A tuple of (characters, labels).
        """
        return self._label_chars(self.data)

    def _labeler(self):
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple ``(characters, labels)`` where both members are lists of
            lists: one inner list per sentence for ``SENTS_RAW``, or a single
            inner list for ``WHOLE_RAW``. Both are empty for any other
            corpus type.
        """
        result_chars = []
        result_labels = []
        if self.corpus_type == self.SENTS_RAW:
            for sent in self.data:
                characters, labels = self._sent_labeler(sent)
                result_chars.append(characters)
                result_labels.append(labels)
        elif self.corpus_type == self.WHOLE_RAW:
            characters, labels = self._text_labeler()
            # Wrap the results in lists to keep the return structure the
            # same as for SENTS_RAW.
            result_chars = [characters]
            result_labels = [labels]
        return result_chars, result_labels

    def label_text(self, textinput, corpus_type):
        """Label text and return characters and labels.

        The input and corpus type are also stored on the instance as
        ``self.data`` and ``self.corpus_type``.

        Args:
            textinput: A string (``WHOLE_RAW``) or a list of strings
                (``SENTS_RAW``) to label.
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW.

        Returns:
            A tuple of (characters, labels), each a list of lists.

        Raises:
            ValueError: If ``corpus_type`` is unknown or does not fit the
                type of ``textinput``.
        """
        whole_ok = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        sents_ok = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (whole_ok or sents_ok):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")
        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars, labels):
        """Generate text with class characters inserted after tagged chars.

        Args:
            chars: A sequence of characters.
            labels: A sequence of labels, one per character.

        Returns:
            A string with class characters inserted according to the labels.
        """
        # Only the first ``class_count`` classes take part, in order.
        classes = list(zip(self._tags, self._class_chars))[:self.class_count]
        pieces = []
        for char, label in zip(chars, labels):
            # Always add the character
            pieces.append(char)
            if label != 0:
                # Equality (not hashing) is used so that array/tensor scalar
                # labels compare correctly against the configured tags.
                for tag, class_char in classes:
                    if label == tag:
                        pieces.append(class_char)
                        break
        return ''.join(pieces)

    @staticmethod
    def _is_wrapped(seq):
        """Return True if ``seq`` is a one-element list/tuple holding a list/tuple."""
        return (isinstance(seq, (list, tuple))
                and len(seq) == 1
                and isinstance(seq[0], (list, tuple)))

    def text_generator(self, chars, labels, corpus_type):
        """Generate text with class characters inserted.

        Args:
            chars: A list of characters (``WHOLE_RAW``) or a list of lists of
                characters (``SENTS_RAW``). For ``WHOLE_RAW`` the
                single-element wrapped form returned by ``label_text`` is
                accepted as well.
            labels: Labels with the same structure as ``chars``.
            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW.

        Returns:
            A string for ``WHOLE_RAW`` or a list of strings for ``SENTS_RAW``.

        Raises:
            ValueError: If ``corpus_type`` is not a known corpus type.
        """
        if corpus_type == self.SENTS_RAW:
            # Process each sentence separately
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]
        if corpus_type == self.WHOLE_RAW:
            # Accept label_text()'s wrapped output so the round trip works.
            if self._is_wrapped(chars) and self._is_wrapped(labels):
                chars, labels = chars[0], labels[0]
            return self._text_generator(chars, labels)
        raise ValueError(f"Invalid corpus type: {corpus_type!r}")