# DualStep-DropNet / labeler.py
# matin-ebrahimkhani's picture
# Upload the model
# 07b65ad verified
import re
import numpy as np
import sys
# Make numpy print arrays in full instead of summarising long ones with "...".
# NOTE: this runs at import time and changes numpy's print options for the
# whole process, not just for this module.
np.set_printoptions(threshold=sys.maxsize)
class Labeler:
    """Convert between raw text and parallel (characters, labels) sequences.

    Each class ``i`` is described by a regular expression (``regexes[i]``),
    a numeric tag (``tags[i]``) and the character that is re-inserted when
    text is regenerated (``chars[i]``). Labeling removes every character
    matched by a class regex and stores that class's tag on the character
    immediately before it; all other characters get the label ``0``.

    With the default arguments, class 0 is horizontal whitespace (tag 1,
    regenerated as a space) and class 1 is U+200C (tag 2, regenerated as
    U+200C).
    """

    # Define corpus types as class constants
    WHOLE_RAW = 'whole_raw'
    SENTS_RAW = 'sents_raw'

    def __init__(self, tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "\u200c"),
                 class_count=2):
        """Store the class definitions and reset the working state.

        Args:
            tags: Label value written for each class.
            regexes: Pattern whose matches are removed for each class.
            chars: Character re-inserted for each class when generating text.
            class_count: Number of leading entries of ``tags`` / ``regexes`` /
                ``chars`` that are actually used.
        """
        self._tags = tags
        self._regexes = regexes
        self._class_chars = chars
        self.class_count = class_count
        # Working state, filled in by label_text().
        self.data = None
        self.labels = []
        self.corpus_type = None

    def _sent_labeler(self, sent: str):
        """Label a single string and return its characters and labels.

        Args:
            sent: The string to be labeled.

        Returns:
            A tuple ``(characters, labels)`` of equal-length lists. Characters
            matched by a class regex are removed from both lists; the
            character directly before each match carries that class's tag.
            Classes are applied in order, so a later class overwrites an
            earlier one on the same position.
        """
        labels = [0] * len(sent)
        # A set, so a position matched by more than one regex is removed once.
        deletable = set()
        for i in range(self.class_count):
            tag = self._tags[i]
            for match in re.finditer(self._regexes[i], sent):
                idx = match.start()
                # A match at position 0 has no preceding character; without
                # this guard ``labels[-1]`` (the last character) would be
                # tagged by mistake.
                if idx > 0:
                    labels[idx - 1] = tag
                deletable.add(idx)
        # Filter in a single pass instead of popping index by index, which
        # is quadratic on long inputs.
        characters = [ch for pos, ch in enumerate(sent) if pos not in deletable]
        kept_labels = [lab for pos, lab in enumerate(labels)
                       if pos not in deletable]
        return characters, kept_labels

    def _text_labeler(self):
        """Label the whole text in ``self.data``; see ``_sent_labeler``."""
        return self._sent_labeler(self.data)

    def _labeler(self):
        """Label ``self.data`` according to ``self.corpus_type``.

        Returns:
            A tuple ``(chars, labels)`` of lists with one inner list per
            sentence. For ``WHOLE_RAW`` the single text is wrapped in a
            one-element list so both corpus types share the same structure.
            Any other corpus type yields two empty lists.
        """
        result_chars = []
        result_labels = []
        if self.corpus_type == self.SENTS_RAW:
            for sent in self.data:
                characters, labels = self._sent_labeler(sent)
                result_chars.append(characters)
                result_labels.append(labels)
        elif self.corpus_type == self.WHOLE_RAW:
            characters, labels = self._text_labeler()
            result_chars = [characters]
            result_labels = [labels]
        return result_chars, result_labels

    def label_text(self, textinput, corpus_type):
        """Label text and return characters and labels.

        Args:
            textinput: A ``str`` for ``WHOLE_RAW`` or a ``list`` of strings
                for ``SENTS_RAW``.
            corpus_type: Either ``Labeler.WHOLE_RAW`` or ``Labeler.SENTS_RAW``.

        Returns:
            A tuple ``(characters, labels)``; each is a list of per-sentence
            lists (a single inner list for ``WHOLE_RAW``).

        Raises:
            ValueError: If ``corpus_type`` is unknown or does not match the
                type of ``textinput``.
        """
        is_whole = corpus_type == self.WHOLE_RAW and isinstance(textinput, str)
        is_sents = corpus_type == self.SENTS_RAW and isinstance(textinput, list)
        if not (is_whole or is_sents):
            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")
        self.data = textinput
        self.corpus_type = corpus_type
        return self._labeler()

    def _text_generator(self, chars, labels):
        """Rebuild one string from characters and their labels.

        Args:
            chars: A list of characters.
            labels: A list of labels for those characters.

        Returns:
            The characters joined into a string, with the class character
            inserted after every character whose label equals a class tag.
            Labels that are non-zero but match no tag insert nothing.
        """
        class_pairs = list(zip(self._tags[:self.class_count],
                               self._class_chars[:self.class_count]))
        result = []
        for char, label in zip(chars, labels):
            result.append(char)
            if label != 0:
                # Compare with ``==`` rather than a dict lookup so label
                # objects that are not plain ints still match their tag.
                for tag, class_char in class_pairs:
                    if label == tag:
                        result.append(class_char)
                        break
        return ''.join(result)

    def text_generator(self, chars, labels, corpus_type):
        """Generate text with class characters inserted according to labels.

        Args:
            chars: A list of characters (``WHOLE_RAW``) or a list of lists of
                characters (``SENTS_RAW``).
            labels: A list of labels (``WHOLE_RAW``) or a list of lists of
                labels (``SENTS_RAW``).
            corpus_type: Either ``Labeler.WHOLE_RAW`` or ``Labeler.SENTS_RAW``.

        Returns:
            A string for ``WHOLE_RAW`` or a list of strings for ``SENTS_RAW``.

        Raises:
            ValueError: If ``corpus_type`` is neither of the two constants.
        """
        if corpus_type == self.SENTS_RAW:
            return [self._text_generator(sent_chars, sent_labels)
                    for sent_chars, sent_labels in zip(chars, labels)]
        if corpus_type == self.WHOLE_RAW:
            return self._text_generator(chars, labels)
        # Fail loudly instead of silently returning None, matching label_text.
        raise ValueError(f"Invalid corpus type: {corpus_type}")