DualStep-DropNet / labeler.py

Upload the model

07b65ad verified 10 months ago

6.38 kB

	import re
	import numpy as np
	import sys

	# Module-level side effect: importing this module changes NumPy's global
	# print options so arrays are printed in full instead of being summarized
	# with "..." (threshold=sys.maxsize effectively disables summarization).
	# NOTE(review): this affects every array printed by the importing process,
	# and NumPy is not otherwise used in the visible part of this file; confirm
	# callers rely on it before moving or removing it.
	np.set_printoptions(threshold=sys.maxsize)


	class Labeler:
	    """Convert text to per-character marker labels and back.

	    A "class" is a kind of marker character found by a regular expression
	    (by default: whitespace other than CR/LF/VT/FF, and U+200C ZERO WIDTH
	    NON-JOINER). Labeling removes every marker from the text and stores the
	    class tag on the character immediately before it; all other characters
	    get label 0. Generation re-inserts the class character after every
	    character whose label equals that class's tag.

	    The patterns are expected to match exactly one character per match:
	    only the character at ``match.start()`` is removed.
	    """

	    # Corpus types accepted by label_text() and text_generator().
	    WHOLE_RAW = 'whole_raw'
	    SENTS_RAW = 'sents_raw'

	    def __init__(self, tags=(1, 2),
	                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
	                 chars=(" ", "\u200c"),
	                 class_count=2):
	        """Configure the marker classes.

	        Args:
	            tags: Label value assigned for each class (label 0 means
	                "no marker follows").
	            regexes: Pattern that finds the marker of each class.
	            chars: Character re-inserted for each class when generating
	                text. The second default is U+200C, written as an escape
	                because the character itself is invisible in source.
	            class_count: Number of leading entries of ``tags``, ``regexes``
	                and ``chars`` that are actually used.
	        """
	        self._tags = tags
	        self._regexes = regexes
	        self._class_chars = chars
	        self.class_count = class_count

	        self.data = None
	        self.labels = []
	        self.corpus_type = None

	    def _label_string(self, text: str):
	        """Strip markers from ``text`` and label the characters before them.

	        Args:
	            text: The string to label.

	        Returns:
	            A tuple ``(characters, labels)`` of two equally long lists: the
	            characters of ``text`` with every marker removed, and the label
	            of each remaining character.
	        """
	        labels = [0] * len(text)
	        # A set, so an index matched by more than one pattern is removed once.
	        removed = set()

	        # Classes are processed in order; a later class overrides an earlier
	        # one if both want to label the same character.
	        for class_idx in range(self.class_count):
	            for match in re.finditer(self._regexes[class_idx], text):
	                idx = match.start()
	                # A marker at position 0 has no preceding character. Without
	                # this guard, labels[-1] would tag the LAST character instead.
	                if idx > 0:
	                    labels[idx - 1] = self._tags[class_idx]
	                removed.add(idx)

	        # Single filtering pass instead of repeated pop() calls. A label that
	        # was written onto a character that is itself a marker is dropped
	        # together with that character.
	        characters = []
	        kept_labels = []
	        for idx, (char, label) in enumerate(zip(text, labels)):
	            if idx not in removed:
	                characters.append(char)
	                kept_labels.append(label)

	        return characters, kept_labels

	    def _sent_labeler(self, sent: str):
	        """Label a single sentence and return characters and labels.

	        Args:
	            sent: The sentence to be labeled.

	        Returns:
	            A tuple of (characters, labels).
	        """
	        return self._label_string(sent)

	    def _text_labeler(self):
	        """Label the whole text held in ``self.data``.

	        Returns:
	            A tuple of (characters, labels).
	        """
	        return self._label_string(self.data)

	    def _labeler(self):
	        """Label ``self.data`` according to ``self.corpus_type``.

	        Returns:
	            A tuple ``(chars, labels)`` of lists of lists: one inner list per
	            sentence for SENTS_RAW, a single inner list for WHOLE_RAW, and
	            two empty lists for any other corpus type.
	        """
	        if self.corpus_type == self.SENTS_RAW:
	            result_chars = []
	            result_labels = []
	            for sent in self.data:
	                characters, labels = self._sent_labeler(sent)
	                result_chars.append(characters)
	                result_labels.append(labels)
	            return result_chars, result_labels

	        if self.corpus_type == self.WHOLE_RAW:
	            characters, labels = self._text_labeler()
	            # Wrapped so both corpus types return the same nesting depth.
	            return [characters], [labels]

	        return [], []

	    def label_text(self, textinput, corpus_type):
	        """Label text and return characters and labels.

	        The input and corpus type are stored on the instance (``self.data``,
	        ``self.corpus_type``) only after validation succeeds.

	        Args:
	            textinput: A string for WHOLE_RAW, or a list of strings for
	                SENTS_RAW.
	            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW.

	        Returns:
	            A tuple ``(chars, labels)`` of lists of lists. For WHOLE_RAW each
	            outer list has exactly one element, so pass ``chars[0]`` and
	            ``labels[0]`` to ``text_generator`` to rebuild the text.

	        Raises:
	            ValueError: If ``corpus_type`` is unknown or ``textinput`` does
	                not have the type required by ``corpus_type``.
	        """
	        if corpus_type == self.WHOLE_RAW:
	            expected_type = str
	        elif corpus_type == self.SENTS_RAW:
	            expected_type = list
	        else:
	            raise ValueError(
	                f"Unknown corpus_type {corpus_type!r}; expected "
	                f"{self.WHOLE_RAW!r} or {self.SENTS_RAW!r}")

	        if not isinstance(textinput, expected_type):
	            raise ValueError(f"Invalid input: expected {corpus_type} with compatible data type")

	        self.data = textinput
	        self.corpus_type = corpus_type
	        return self._labeler()

	    def _text_generator(self, chars, labels):
	        """Rebuild one string from characters and their labels.

	        Args:
	            chars: A list of characters.
	            labels: A list of labels for those characters. Pairs are taken
	                with zip(), so extra items in the longer list are ignored.

	        Returns:
	            A string in which each character is followed by the class
	            character of the first class whose tag equals its label; labels
	            equal to 0 or matching no tag insert nothing.
	        """
	        pieces = []
	        for char, label in zip(chars, labels):
	            pieces.append(char)
	            if label != 0:
	                # Compared with == (not hashed) so any label type that
	                # supports equality with the tags works.
	                for class_idx in range(self.class_count):
	                    if label == self._tags[class_idx]:
	                        pieces.append(self._class_chars[class_idx])
	                        break

	        return ''.join(pieces)

	    def text_generator(self, chars, labels, corpus_type):
	        """Generate text with class characters inserted.

	        Args:
	            chars: A list of characters (WHOLE_RAW) or a list of lists of
	                characters (SENTS_RAW).
	            labels: A list of labels (WHOLE_RAW) or a list of lists of
	                labels (SENTS_RAW).
	            corpus_type: Either Labeler.WHOLE_RAW or Labeler.SENTS_RAW.

	        Returns:
	            A string (WHOLE_RAW) or a list of strings (SENTS_RAW).

	        Raises:
	            ValueError: If ``corpus_type`` is unknown.
	        """
	        if corpus_type == self.SENTS_RAW:
	            return [self._text_generator(sent_chars, sent_labels)
	                    for sent_chars, sent_labels in zip(chars, labels)]
	        if corpus_type == self.WHOLE_RAW:
	            return self._text_generator(chars, labels)
	        # Fail loudly, as label_text() does, instead of returning None.
	        raise ValueError(
	            f"Unknown corpus_type {corpus_type!r}; expected "
	            f"{self.WHOLE_RAW!r} or {self.SENTS_RAW!r}")