File size: 4,243 Bytes
246df79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os.path
import re

from nltk import *
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from Pinpoint_Internal.Logger import *

# Import-time probe for the NLTK data: tagging a single token raises LookupError when the
# resource pos_tag needs is not installed, in which case download() is called to fetch it.
# `tagged` is only assigned to trigger the lookup.
# NOTE(review): download() is called with no arguments, which presumably starts NLTK's
# interactive downloader rather than fetching one specific package -- confirm this is intended.
try:
    tagged = pos_tag(["test"])
except LookupError:
    download()


# TODO: work out how to make nltk.download() run only once.

class sanitization():
    """
    This class is used to sanitize a given corpus of data. Each whitespace separated word has its non-alphabet
    characters removed, is set to lower case, is dropped if it is a stop word, is stemmed, and is dropped if it is too
    small. To save on repeat runs a local copy of the sanitized corpus is saved to "sanitized_text.txt" in the output
    folder, and that copy is used unless the caller forces new data.
    """

    def sanitize(self, text, output_folder, force_new_data_and_dont_persisit=False):
        """
        Entry function for sanitizing text
        :param text: the raw text to sanitize
        :param output_folder: folder the cached "sanitized_text.txt" file is read from and written to
        :param force_new_data_and_dont_persisit: if True any cached file is ignored and the result is not saved
        :return: sanitized text, words separated by single spaces with no leading or trailing whitespace
        """
        sanitize_file_name = os.path.join(output_folder, "sanitized_text.txt")

        # If a file exists don't sanitize given text
        if os.path.isfile(sanitize_file_name) and not force_new_data_and_dont_persisit:
            logger.print_message("Sanitized file exists. Using data")

            with open(sanitize_file_name, 'r', encoding="utf8") as file_to_read:
                return file_to_read.read().strip()

        # Split on any run of whitespace so words separated by newlines or tabs are sanitized individually
        # and repeated spaces do not produce empty words.
        words = text.split()
        total_words = len(words)
        logger.print_message("Starting sanitization... {} words to go".format(total_words))

        sanitized_words = []
        for number, word in enumerate(words, start=1):
            word = self._sanitize_word(word)
            logger.print_message("Completed {} of {} sanitized words".format(number, total_words))

            # Dropped words come back as "", they must not leave stray spaces in the output
            if word:
                sanitized_words.append(word)

        final_text = " ".join(sanitized_words)

        if not force_new_data_and_dont_persisit:
            with open(sanitize_file_name, 'w', encoding="utf8") as file_to_write:
                file_to_write.write(final_text)

        return final_text

    def _sanitize_word(self, word):
        """
        Runs a single word through every sanitization step
        :param word: a word containing no whitespace
        :return: the sanitized word, or "" if the word was removed by one of the steps
        """
        word = self.lower(self.remove_non_alpha(word))

        # Stop words are removed before stemming because the stemmer rewrites many of them (e.g. "this" becomes
        # "thi"), after which they no longer match the stop word list. remove_stop_words pads its result with a
        # trailing space, which is stripped so it does not count towards the word length later on.
        word = self.remove_stop_words(word).strip()
        if not word:
            return ""

        word = self.stemmer(word)

        # Checked again after stemming so inflected words whose stem is a stop word are still removed
        word = self.remove_stop_words(word).strip()

        return self.remove_small_words(word)

    def stemmer(self, word):
        """
        Get stemms of words
        :param word: the word to stem
        :return: the stemmed word using the porter stemmer
        """
        # TODO: assess whether another stemmer (e.g. LancasterStemmer) performs better
        return PorterStemmer().stem(word)

    def lower(self, word):
        """
        get the lower case representation of words
        :param word: the word to convert
        :return: the lowercase representation of the word
        """
        return word.lower()

    def remove_stop_words(self, text):
        """
        Remove stop words
        :param text: one or more whitespace separated words
        :return: the remaining words, each followed by a single space (so a non-empty result ends in a space),
        or "" if every word was a stop word
        """
        kept_words = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]

        # The trailing space after every word is kept for backwards compatibility with existing callers
        return "".join(word + " " for word in kept_words)

    def remove_non_alpha(self, word):
        """
        Removes non alphabet characters (Excluding spaces)
        :param word: the text to clean
        :return: the word with newlines and tabs turned into spaces and all other non-alpha characters removed
        """
        word = word.replace("\n", " ").replace("\t", " ").replace("  ", " ")

        return re.sub('[^a-zA-Z ]', '', word)

    def remove_small_words(self, word, length_to_remove_if_not_equal=4):
        """
        Removes words that are too small, defaults to removing words of 3 characters or below.
        :param word: the word to check
        :param length_to_remove_if_not_equal: the minimum number of characters a word needs to be kept
        :return: "" if the word is shorter than the minimum length, otherwise the word unchanged
        """
        # Surrounding whitespace is not part of the word, so it must not count towards its length
        if len(word.strip()) >= length_to_remove_if_not_equal:
            return word

        return ""