Spaces:
Sleeping
Sleeping
import os.path
import re

from nltk import *
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from Pinpoint_Internal.Logger import *
# If the NLTK data needed for POS tagging is missing, download it once.
try:
    tagged = pos_tag(["test"])
except LookupError:
    # Fetch only the tagger model pos_tag() needs; a bare download() opens
    # the interactive downloader, which blocks waiting for user input.
    download("averaged_perceptron_tagger")
class sanitization():
    """
    Sanitizes a given corpus of text: removes stop words, stems words, drops
    short words, strips non-alphabetic characters, and lower-cases words.
    To save time on repeat runs a serialised copy of the sanitized corpus is
    persisted locally and reused unless this behaviour is overridden.
    """

    # Compiled once at class creation; compiling per call is wasted work.
    # Keeps ASCII letters and spaces only.
    _NON_ALPHA_RE = re.compile('[^a-zA-Z ]')

    def sanitize(self, text, output_folder, force_new_data_and_dont_persisit=False):
        """
        Entry function for sanitizing text.

        :param text: raw corpus to sanitize (split on single spaces)
        :param output_folder: folder used to persist/load the cached copy
        :param force_new_data_and_dont_persisit: when True, always re-sanitize
            and do not write the cached copy
        :return: sanitized text
        """
        sanitize_file_name = os.path.join(output_folder, "sanitized_text.txt")

        # Reuse the persisted copy when present and a rebuild wasn't forced.
        if os.path.isfile(sanitize_file_name) and not force_new_data_and_dont_persisit:
            logger.print_message("Sanitized file exists. Using data")
            with open(sanitize_file_name, 'r', encoding="utf8") as file_to_read:
                final_text = file_to_read.read()
        else:
            words = text.split(" ")
            total_words = len(words)
            logger.print_message("Starting sanitization... {} words to go".format(total_words))

            sanitized_words = []
            for number, word in enumerate(words, start=1):
                word = self.remove_non_alpha(word)
                word = self.lower(word)
                word = self.stemmer(word)
                word = self.remove_stop_words(word)
                word = self.remove_small_words(word)

                # The helpers return "" (never None) for rejected words, so
                # test truthiness: appending empty words used to inject extra
                # spaces that a single replace("  ", " ") pass could not
                # fully collapse.
                word = word.strip()
                if word:
                    sanitized_words.append(word)

                logger.print_message("Completed {} of {} sanitized words".format(number, total_words))

            # join() is O(n); repeated string concatenation is quadratic.
            final_text = " ".join(sanitized_words)

            if not force_new_data_and_dont_persisit:
                with open(sanitize_file_name, 'w', encoding="utf8") as file_to_write:
                    file_to_write.write(final_text)

        return final_text.strip()

    def stemmer(self, word):
        """
        Stem a word using the Porter stemmer.

        :param word: word to stem
        :return: the stemmed word
        """
        # Build the stemmer once and cache it on the instance; the original
        # constructed a new PorterStemmer for every word.
        # todo should another stemmer be assessed? e.g. LancasterStemmer
        porter = getattr(self, "_porter_stemmer", None)
        if porter is None:
            porter = PorterStemmer()
            self._porter_stemmer = porter
        return porter.stem(word)

    def lower(self, word):
        """
        Get the lower-case representation of a word.

        :param word: word to convert
        :return: the lowercase representation of the word
        """
        return word.lower()

    def remove_stop_words(self, text):
        """
        Remove English stop words from the given text.

        :param text: text to filter
        :return: the text with stop words removed
        """
        return " ".join(
            word for word in text.split() if word not in ENGLISH_STOP_WORDS
        )

    def remove_non_alpha(self, word):
        """
        Removes non-alphabet characters (excluding spaces).

        :param word: word to clean
        :return: the word with non-alpha characters removed
        """
        # Normalise whitespace first so newlines/tabs survive as spaces
        # rather than being deleted outright by the regex.
        word = word.replace("\n", " ").replace("\t", " ").replace("  ", " ")
        return self._NON_ALPHA_RE.sub('', word)

    def remove_small_words(self, word, length_to_remove_if_not_equal=4):
        """
        Removes words that are too small; by default words of 3 characters
        or fewer are removed.

        :param word: word to check
        :param length_to_remove_if_not_equal: minimum length a word must have
            to be kept (default 4)
        :return: "" if the word is below the threshold, otherwise the word
        """
        if len(word) >= length_to_remove_if_not_equal:
            return word
        return ""