Spaces:
Sleeping
Sleeping
File size: 1,617 Bytes
ed8878f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# Vocabulary ids reserved for the special "SOS", "EOS" and "UNK" entries.
# Lang seeds its word/index tables with these same three values.
SOS_token, EOS_token, UNK_token = 0, 1, 2
class Lang:
    """Vocabulary for one language: word <-> index tables plus word counts.

    Indices 0, 1 and 2 are reserved for the special "SOS", "EOS" and "UNK"
    entries; every other word is numbered from 3 upwards in first-seen order.
    """

    def __init__(self, name):
        """Create a vocabulary holding only the three special entries.

        Args:
            name: Language label. The exact value ``'English'`` enables
                contraction expansion in :meth:`addSentence`.
        """
        self.name = name
        self.word_to_index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.index_to_word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.word_to_count = {}
        self.n_words = 3
        # Every run of non-word characters, whitespace or digits is replaced
        # by a single space during normalisation.
        self.regex_pattern = r"[\W\s\d]+"
        # Flat list of (expanded form, contracted form) pairs: even positions
        # hold the expansion, the following odd position holds the contraction
        # as it looks after normalisation ("i'm" -> "i m").
        # Each line needs its trailing comma: without it adjacent string
        # literals are silently concatenated and the pairing shifts.
        self.eng_prefixes = [
            "i will", "i ll",
            "i am", "i m",
            "i have", "i ve",
            "he is", "he s",
            "she is", "she s",
            "you are", "you re",
            "we are", "we re",
            "they are", "they re",
            "i did", "i d",
        ]

    def addSentence(self, sentence):
        """Normalise ``sentence`` and add each resulting word to the vocabulary.

        The sentence is lower-cased and split on single spaces. In every
        piece, runs of punctuation, whitespace and digits are replaced by one
        space and the result is stripped; pieces that end up empty are
        skipped. For the ``'English'`` language, a piece that matches an entry
        of ``eng_prefixes`` is replaced by its expanded form and added word by
        word ("i'm" -> "i", "am"). Any other piece is added as a single
        entry, even if normalisation left a space inside it.

        Args:
            sentence: Raw text to add.
        """
        # Function-scope import: SOURCE shows no module-level ``import re``.
        import re

        # Map both members of every pair to the expanded form. Looking up the
        # expansion directly avoids the old ``index - 1`` arithmetic, which
        # picked the wrong entry for expanded forms (and wrapped to the last
        # entry for index 0).
        expansions = {}
        for full, short in zip(self.eng_prefixes[0::2],
                               self.eng_prefixes[1::2]):
            expansions[full] = full
            expansions[short] = full

        is_english = self.name == 'English'
        for raw in sentence.lower().split(' '):
            word = re.sub(self.regex_pattern, ' ', raw).strip()
            if not word:
                # Empty piece: repeated spaces, or only punctuation/digits.
                continue
            expanded = expansions.get(word) if is_english else None
            if expanded is None:
                self.addWord(word)
                continue
            for subword in expanded.split(' '):
                if subword:
                    self.addWord(subword)

    def addWord(self, word):
        """Register one occurrence of ``word``.

        A new word receives the next free index (``n_words``) and a count of
        1; a known word only has its count incremented.

        Args:
            word: Already-normalised token to record.
        """
        if word in self.word_to_index:
            self.word_to_count[word] += 1
            return
        self.word_to_index[word] = self.n_words
        self.index_to_word[self.n_words] = word
        self.word_to_count[word] = 1
        self.n_words += 1
|