Spaces:
Sleeping
Sleeping
# Reserved vocabulary indices. They match the entries that Lang.__init__
# seeds into word_to_index ("SOS" -> 0, "EOS" -> 1, "UNK" -> 2).
SOS_token = 0  # presumably the start-of-sentence marker -- TODO confirm
EOS_token = 1  # presumably the end-of-sentence marker -- TODO confirm
UNK_token = 2  # presumably the out-of-vocabulary fallback -- TODO confirm
class Lang:
    """Vocabulary for a single language.

    Maintains a two-way mapping between words and integer indices together
    with a per-word occurrence count. Indices 0, 1 and 2 are pre-assigned to
    the reserved tokens "SOS", "EOS" and "UNK"; words added later receive
    consecutive indices starting at 3.

    Attributes:
        name: Language label. The exact value 'English' enables contraction
            expansion in addSentence.
        word_to_index: Maps each known word to its integer index.
        index_to_word: Inverse of word_to_index.
        word_to_count: Number of times each word has been added.
        n_words: Number of distinct entries, reserved tokens included.
        regex_pattern: Regex whose matches (runs of non-word characters,
            whitespace and digits) are replaced by a single space.
        eng_prefixes: Flat list of alternating (expanded, contracted) forms,
            kept for callers that read it; the lookup itself uses
            _ENG_EXPANSIONS.
    """

    # Contracted form (as it looks after punctuation has been replaced by a
    # space, e.g. "i'm" -> "i m") mapped to its expanded form. A dict makes the
    # direction of the lookup explicit, so an expanded form can never be
    # mistaken for a contraction.
    _ENG_EXPANSIONS = {
        "i ll": "i will",
        "i m": "i am",
        "i ve": "i have",
        "he s": "he is",
        "she s": "she is",
        "you re": "you are",
        "we re": "we are",
        "they re": "they are",
        "i d": "i did",
    }

    def __init__(self, name):
        """Create an empty vocabulary holding only the reserved tokens.

        Args:
            name: Language label (see the class docstring).
        """
        self.name = name
        self.word_to_index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.word_to_count = {}
        self.n_words = 3
        self.regex_pattern = r"[\W\s\d]+"
        # Rebuilt from the mapping so the two can never drift apart (the
        # hand-written list used to lose an entry to a missing comma).
        self.eng_prefixes = []
        for contracted, expanded in self._ENG_EXPANSIONS.items():
            self.eng_prefixes += [expanded, contracted]

    def addSentence(self, sentence):
        """Add every word of *sentence* to the vocabulary.

        The sentence is lower-cased and split on single spaces. Inside each
        token, runs matching regex_pattern are replaced by one space and the
        result is stripped; tokens that end up empty are skipped. For the
        'English' language a token that equals a known contraction is
        replaced by its expanded form and each resulting word is added
        separately. Every other token is added as a single entry.

        Args:
            sentence: Raw text of one sentence.
        """
        expand = self.name == 'English'
        for token in sentence.lower().split(' '):
            word = re.sub(self.regex_pattern, ' ', token).strip()
            if not word:
                continue
            if expand and word in self._ENG_EXPANSIONS:
                for subword in self._ENG_EXPANSIONS[word].split(' '):
                    self.addWord(subword)
            else:
                # NOTE(review): a token such as "don't" becomes "don t" and is
                # stored as one entry with an inner space, exactly as before.
                self.addWord(word)

    def addWord(self, word):
        """Register one occurrence of *word*.

        A word not seen before gets the next free index; in every case its
        occurrence count is incremented.

        Args:
            word: The word to add, used verbatim as the dictionary key.
        """
        if word not in self.word_to_index:
            self.word_to_index[word] = self.n_words
            self.index_to_word[self.n_words] = word
            self.n_words += 1
        # .get() because the reserved tokens are indexed from the start but
        # have no count entry yet; a plain += raised KeyError for them.
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1