# Hosting-page header captured with this file (not code): vishnuraggav's picture / First / ed8878f
# Reserved vocabulary indices. They match the ids that Lang gives its "SOS",
# "EOS" and "UNK" entries (presumably start-of-sentence, end-of-sentence and
# unknown-word markers).
SOS_token, EOS_token, UNK_token = 0, 1, 2
class Lang:
    """Vocabulary for one language: word <-> index tables plus word counts.

    Indices 0, 1 and 2 are pre-assigned to the "SOS", "EOS" and "UNK"
    entries; words added later are numbered from 3 upward in first-seen
    order.

    Attributes:
        name: Language label. The exact value 'English' turns on contraction
            expansion in addSentence.
        word_to_index: Maps each known word to its integer index.
        index_to_word: Inverse of word_to_index.
        word_to_count: Number of times each word has been passed to addWord.
        n_words: Number of vocabulary entries, including the three reserved
            ones.
        regex_pattern: Regex whose matches (runs of non-word characters,
            whitespace or digits) are replaced by a single space when a
            chunk of text is normalized.
        eng_prefixes: Flat list of (full form, contraction) pairs. Even
            positions hold the full form; the odd position right after it
            holds the same phrase as it looks once the apostrophe of the
            contraction has been replaced by a space.
    """

    def __init__(self, name: str) -> None:
        """Create a vocabulary that contains only the reserved entries.

        Args:
            name: Language label stored on the instance.
        """
        self.name = name
        self.word_to_index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.word_to_count = {}
        self.n_words = 3
        self.regex_pattern = r"[\W\s\d]+"
        # Every entry must be followed by a comma: adjacent string literals
        # are silently concatenated, which would shift every later pair.
        # The trailing space in "i am " is harmless because expansions are
        # split on spaces and empty pieces are dropped.
        self.eng_prefixes = [
            "i will", "i ll",
            "i am ", "i m",
            "i have", "i ve",
            "he is", "he s",
            "she is", "she s",
            "you are", "you re",
            "we are", "we re",
            "they are", "they re",
            "i did", "i d",
        ]

    def addSentence(self, sentence: str) -> None:
        """Normalize a sentence and add each resulting word to the vocabulary.

        The sentence is lower-cased and split on single spaces. In every
        chunk, runs matching ``regex_pattern`` are replaced by one space and
        the result is stripped; chunks that end up empty are skipped.

        When ``name`` is 'English' and a chunk equals an entry of
        ``eng_prefixes``, a contraction (odd position) is replaced by its
        full form (the entry just before it) and the phrase is added word by
        word. A chunk that already equals a full form is kept and also added
        word by word.

        NOTE(review): any other chunk with inner punctuation or digits
        (e.g. "don't" -> "don t") is stored as ONE entry that contains a
        space. This is left unchanged because code outside this class may
        rely on it -- confirm before changing.

        Args:
            sentence: Raw text to add.
        """
        # Function-scope import: the visible file has no import block, and
        # this keeps the class usable on its own.
        import re

        separator_run = re.compile(self.regex_pattern)
        expand_contractions = self.name == 'English'

        for chunk in sentence.lower().split(' '):
            word = separator_run.sub(' ', chunk).strip()
            if not word:
                continue

            if expand_contractions and word in self.eng_prefixes:
                position = self.eng_prefixes.index(word)
                # Only odd positions are contractions. Full forms (even
                # positions) must not be swapped for the previous pair's
                # contraction, and position 0 must not wrap to the last one.
                if position % 2 == 1:
                    word = self.eng_prefixes[position - 1]
                for subword in word.split(' '):
                    if subword:
                        self.addWord(subword)
            else:
                self.addWord(word)

    def addWord(self, word: str) -> None:
        """Register one occurrence of ``word``.

        A word not yet in ``word_to_index`` receives the next free index
        (``n_words``), which is then incremented. The occurrence count of
        the word is increased by one in every case.

        Args:
            word: Token to register, stored exactly as given.
        """
        if word not in self.word_to_index:
            self.word_to_index[word] = self.n_words
            self.index_to_word[self.n_words] = word
            self.n_words += 1
        # .get() because the reserved entries are in word_to_index from the
        # start but have no count yet; a plain "+= 1" would raise KeyError.
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1