File size: 1,617 Bytes
ed8878f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Indices reserved for the special "SOS", "EOS" and "UNK" vocabulary entries;
# they match the values Lang.__init__ seeds its lookup tables with.
SOS_token, EOS_token, UNK_token = 0, 1, 2

class Lang:
    """Vocabulary for one language: word <-> index tables plus word counts.

    Attributes:
        name: Language label. The exact value 'English' enables contraction
            expansion in addSentence.
        word_to_index: Maps each known word to its integer index.
        index_to_word: Inverse of word_to_index.
        word_to_count: Number of times each word was passed to addWord.
        n_words: Size of the vocabulary, which is also the next free index.
        regex_pattern: Regex whose matches (runs of non-word characters,
            whitespace and digits) are replaced by a space in every token.
        eng_prefixes: Flat list of (expanded form, contracted form) pairs.
    """

    def __init__(self, name):
        """Create a vocabulary pre-seeded with the SOS/EOS/UNK entries.

        Args:
            name: Language label stored on the instance.
        """
        self.name = name
        self.word_to_index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.word_to_count = {}
        self.n_words = 3  # indices 0-2 are taken by the special entries

        self.regex_pattern = r"[\W\s\d]+"
        # Even positions hold the expanded form, odd positions the contracted
        # form as it looks after punctuation was replaced by a space
        # ("i'm" -> "i m"). Every entry needs its own trailing comma: two
        # adjacent string literals are silently concatenated by Python.
        self.eng_prefixes = [
            "i will", "i ll",
            "i am ", "i m",
            "i have", "i ve",
            "he is", "he s",
            "she is", "she s",
            "you are", "you re",
            "we are", "we re",
            "they are", "they re",
            "i did", "i d",
        ]

    def addSentence(self, sentence):
        """Add every word of a sentence to the vocabulary.

        The sentence is lower-cased and split on single spaces. In each token,
        runs matching regex_pattern are replaced by one space and the result
        is stripped; tokens that end up empty are skipped. For the 'English'
        language a token equal to a contracted form from eng_prefixes is
        replaced by the words of its expanded form.

        Note: a token with internal punctuation or digits that is not a known
        contraction (e.g. "don't" -> "don t") is stored as a single entry
        containing a space.

        Args:
            sentence: Raw sentence text.
        """
        # Function-scope import: no module-level `import re` is visible in
        # this file, and this keeps the class usable on its own.
        import re

        # Contracted form -> expanded form. Built from the paired list so that
        # only contracted forms trigger a replacement; a positional
        # `index - 1` lookup would also fire for expanded forms and return the
        # neighbouring, unrelated entry.
        expansions = {}
        if self.name == 'English':
            expansions = dict(
                zip(self.eng_prefixes[1::2], self.eng_prefixes[0::2])
            )

        for token in sentence.lower().split(' '):
            word = re.sub(self.regex_pattern, ' ', token).strip()
            if not word:
                continue

            expanded = expansions.get(word)
            if expanded is None:
                self.addWord(word)
            else:
                # str.split() with no argument drops the empty pieces caused
                # by stray spaces such as the one in "i am ".
                for subword in expanded.split():
                    self.addWord(subword)

    def addWord(self, word):
        """Register one occurrence of a word.

        A word not seen before gets the next free index. The occurrence
        counter is incremented either way.

        Args:
            word: The token to record.
        """
        if word not in self.word_to_index:
            self.word_to_index[word] = self.n_words
            self.index_to_word[self.n_words] = word
            self.n_words += 1
        # .get() because the pre-seeded special entries have an index but no
        # counter yet; a plain `+= 1` would raise KeyError for them.
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1