import os.path
import re

from nltk import *
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from Pinpoint_Internal.Logger import *
# If the NLTK data used by pos_tag is missing, download it on first run.
try:
    pos_tag(["test"])
except LookupError:
    # Fetch only the tagger model pos_tag needs, instead of calling the
    # bare download() which opens the interactive downloader UI. This runs
    # at most once: after a successful download pos_tag no longer raises.
    download("averaged_perceptron_tagger")
class sanitization():
    """
    Sanitizes a corpus of text: removes stop words, stems words, removes
    small words, strips non-alphabet characters, and lower-cases words.
    To save on repeat runs, a copy of the sanitized corpus is persisted to
    disk and reused unless this behaviour is explicitly overridden.
    """

    def sanitize(self, text, output_folder, force_new_data_and_dont_persisit=False):
        """
        Entry function for sanitizing text.

        :param text: the raw corpus to sanitize
        :param output_folder: directory holding the cached sanitized file
        :param force_new_data_and_dont_persisit: if True, ignore any cached
            file and do not write a new cache (parameter name kept, typo and
            all, for backward compatibility with existing callers)
        :return: the sanitized text
        """
        sanitize_file_name = os.path.join(output_folder, "sanitized_text.txt")

        # Reuse the cached result when present and caching is enabled.
        if os.path.isfile(sanitize_file_name) and not force_new_data_and_dont_persisit:
            logger.print_message("Sanitized file exists. Using data")
            with open(sanitize_file_name, 'r', encoding="utf8") as cached_file:
                return cached_file.read().strip()

        words = text.split(" ")
        total_words = len(words)
        logger.print_message("Starting sanitization... {} words to go".format(total_words))

        # Collect cleaned words in a list and join once at the end; repeated
        # string concatenation is quadratic on large corpora.
        cleaned_words = []
        for number, word in enumerate(words, start=1):
            word = self.remove_non_alpha(word)
            word = self.lower(word)
            word = self.stemmer(word)
            word = self.remove_stop_words(word)
            word = self.remove_small_words(word)
            if word:
                cleaned_words.append(word)
            # Log progress periodically instead of once per word, which
            # dominated runtime on large corpora.
            if number % 1000 == 0 or number == total_words:
                logger.print_message("Completed {} of {} sanitized words".format(number, total_words))

        final_text = " ".join(cleaned_words)

        if not force_new_data_and_dont_persisit:
            with open(sanitize_file_name, 'w', encoding="utf8") as file_to_write:
                file_to_write.write(final_text)

        return final_text

    def stemmer(self, word):
        """
        Get the stem of a word using the Porter stemmer.

        :param word: word to stem
        :return: the stemmed word
        """
        # todo should another stemmer (e.g. Lancaster) be assessed?
        # Build the PorterStemmer once and reuse it; constructing a new
        # instance per word is wasteful on large corpora.
        porter = getattr(self, "_porter", None)
        if porter is None:
            porter = self._porter = PorterStemmer()
        return porter.stem(word)

    def lower(self, word):
        """
        Get the lower case representation of a word.

        :param word: word to convert
        :return: the lowercase representation of the word
        """
        return word.lower()

    def remove_stop_words(self, text):
        """
        Remove English stop words from the given text.

        :param text: text to filter
        :return: the text with stop words removed, words separated by single
            spaces and with no trailing space (so downstream length checks
            in remove_small_words see the true word length)
        """
        kept = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
        return " ".join(kept)

    def remove_non_alpha(self, word):
        """
        Removes non-alphabet characters (excluding spaces).

        :param word: word to clean
        :return: the word with non-alpha characters removed
        """
        # Normalise whitespace first so newlines/tabs survive as spaces.
        word = word.replace("\n", " ").replace("\t", " ").replace("  ", " ")
        regex = re.compile('[^a-zA-Z ]')
        return regex.sub('', word)

    def remove_small_words(self, word, length_to_remove_if_not_equal=4):
        """
        Filter out words that are too short.

        :param word: word to check
        :param length_to_remove_if_not_equal: minimum length a word must have
            to be kept (defaults to 4, i.e. words of 3 characters or fewer
            are dropped)
        :return: the word if it is long enough, otherwise ""
        """
        return word if len(word) >= length_to_remove_if_not_equal else ""