Spaces:
Sleeping
Sleeping
File size: 1,166 Bytes
b758d48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | from src.constants import DEVICE
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from typing import List
from collections import Counter
def clean_text(text: str, lang: str = 'english') -> str:
    """Lowercase *text*, tokenize it, and remove stopwords.

    Args:
        text: Raw input string. Any falsy value (empty string, ``None``)
            short-circuits to an empty string.
        lang: Language name handed to ``stopwords.words`` to select the
            stopword list. The tokenizer itself is called without a
            language argument.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not text:
        return ""

    # Tokenize the lowercased text first, then load the stopword list.
    words = word_tokenize(text.lower())
    excluded = set(stopwords.words(lang))

    return " ".join(word for word in words if word not in excluded)
def build_vocabs(sentences: List[str]):
    """Build a word-to-id vocabulary from whitespace-separated sentences.

    Ids 0, 1 and 2 are reserved for ``<pad>``, ``<pos>`` and ``<eos>``.
    Every other distinct word receives a consecutive id starting at 3, in
    order of first appearance.

    Args:
        sentences: Strings that are split on whitespace to obtain words.

    Returns:
        A dict mapping each word (and the three reserved tokens) to an
        integer id. The ids form the contiguous range
        ``0 .. len(vocab) - 1``.
    """
    specials = {'<pad>': 0, '<pos>': 1, '<eos>': 2}

    # Skip reserved tokens that occur literally in the corpus: numbering
    # them as ordinary words and then overwriting them with their reserved
    # id would leave a hole in the id range (max id >= len(vocab)).
    # dict.fromkeys de-duplicates while keeping first-seen order.
    words = dict.fromkeys(
        word
        for sentence in sentences
        for word in sentence.split()
        if word not in specials
    )

    vocab = {word: idx for idx, word in enumerate(words, start=len(specials))}
    vocab.update(specials)
    return vocab
def sent_tokens(sentence: str, vocab):
    """Convert a whitespace-separated sentence into a list of vocabulary ids.

    The result starts with the id of ``<pos>`` (1 if the key is absent) and
    ends with the id of ``<eos>`` (2 if the key is absent). Words found in
    ``vocab`` map to their id; unknown words map to the ``<unk>`` id when
    that key exists and are dropped otherwise.

    Args:
        sentence: Text that is split on whitespace.
        vocab: Mapping from token string to id.

    Returns:
        The list of ids, including the start and end markers.
    """
    unk_available = '<unk>' in vocab

    ids = [vocab.get('<pos>', 1)]
    # Unknown words survive only when an <unk> entry exists to stand in.
    ids.extend(
        vocab[word] if word in vocab else vocab['<unk>']
        for word in sentence.split()
        if word in vocab or unk_available
    )
    ids.append(vocab.get('<eos>', 2))
    return ids