import re

# Reserved vocabulary indices; they match the entries pre-seeded in
# Lang.word_to_index / Lang.index_to_word.
SOS_token = 0
EOS_token = 1
UNK_token = 2


class Lang:
    """Vocabulary of a single language: word/index maps and word counts.

    Attributes:
        name: Language name; the value 'English' enables contraction
            expansion in addSentence.
        word_to_index: Maps each known word to its integer index.
        index_to_word: Inverse of word_to_index.
        word_to_count: Number of times each word was passed to addWord.
        n_words: Vocabulary size, including the three reserved tokens.
        regex_pattern: Regex whose matches (runs of non-word characters,
            whitespace and digits) are replaced by a single space.
        eng_prefixes: Flat list of alternating (expanded, contracted)
            English forms, e.g. "i will" followed by "i ll".
    """

    def __init__(self, name: str) -> None:
        """Create a vocabulary holding only the SOS, EOS and UNK tokens."""
        self.name = name
        self.word_to_index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.word_to_count = {}
        self.n_words = 3
        self.regex_pattern = r"[\W\s\d]+"
        # Alternating (expanded, contracted) pairs. Every entry needs its
        # own comma: adjacent string literals are silently concatenated.
        # The trailing space in "i am " is kept as-is; it is harmless here
        # because expansions are split on spaces with empty pieces dropped.
        self.eng_prefixes = [
            "i will", "i ll",
            "i am ", "i m",
            "i have", "i ve",
            "he is", "he s",
            "she is", "she s",
            "you are", "you re",
            "we are", "we re",
            "they are", "they re",
            "i did", "i d",
        ]
        # Lookup table: a contracted form maps to its expansion, and an
        # already-expanded form maps to itself.
        self._expansions = {}
        pairs = zip(self.eng_prefixes[::2], self.eng_prefixes[1::2])
        for full, short in pairs:
            self._expansions[short] = full
            self._expansions.setdefault(full.strip(), full)

    def addSentence(self, sentence: str) -> None:
        """Add every word of *sentence* to the vocabulary.

        The sentence is lower-cased and split on single spaces. In each
        piece, runs matching ``regex_pattern`` are replaced by one space
        and the result is stripped; empty results are skipped. For the
        'English' language, a piece equal to a known contraction
        (e.g. "i ll" from "i'll") is replaced by its expansion and each
        expanded word is added separately. Any other piece is added as a
        single vocabulary entry.
        """
        cleaner = re.compile(self.regex_pattern)
        is_english = self.name == 'English'
        for raw in sentence.lower().split(' '):
            word = cleaner.sub(' ', raw).strip()
            if not word:
                continue
            expanded = self._expansions.get(word) if is_english else None
            if expanded is None:
                self.addWord(word)
                continue
            for piece in expanded.split(' '):
                if piece:
                    self.addWord(piece)

    def addWord(self, word: str) -> None:
        """Register one occurrence of *word*, assigning an index if new."""
        if word not in self.word_to_index:
            self.word_to_index[word] = self.n_words
            self.index_to_word[self.n_words] = word
            self.n_words += 1
        # .get() because the reserved tokens are indexed from the start
        # but have no count entry yet.
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1