# NLTK setup: fetch the corpora/models the helpers below rely on.
# NOTE: order matters — the downloads must complete before `stop` is
# built from stopwords.words() at the bottom of this block.
import nltk
from nltk.stem import WordNetLemmatizer
# nltk.download() is idempotent: it skips resources already on disk,
# but still runs (and may print) on every import of this module.
nltk.download("stopwords")
nltk.download('punkt_tab')  # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')    # lemma database used by WordNetLemmatizer
from nltk.corpus import stopwords
import string
# Tokens dropped during cleaning: English stopwords plus every ASCII
# punctuation character. Built once at import time.
stop = set(stopwords.words('english') + list(string.punctuation))
def tokenize_quote(r):
    """Lowercase *r*, word-tokenize it, and drop stopwords/punctuation.

    Filtering uses the module-level ``stop`` set built at import time.
    Returns the surviving tokens as a list.
    """
    return [tok for tok in nltk.word_tokenize(r.lower()) if tok not in stop]
# Single shared lemmatizer: the original constructed a fresh
# WordNetLemmatizer on every call; the instance is reusable, so hoist
# the loop-invariant construction to module level.
_LEMMATIZER = WordNetLemmatizer()

def lemmatize_tokens(tokens: list) -> list:
    """Return the WordNet lemma of every token in *tokens*.

    Args:
        tokens: pre-tokenized words (e.g. output of ``tokenize_quote``).

    Returns:
        A new list with each token replaced by its lemma.
    """
    return [_LEMMATIZER.lemmatize(t) for t in tokens]
def lemmatize_X(X):
    """Tokenize, clean, and lemmatize the ``quote`` column of *X*.

    Args:
        X: pandas DataFrame with a string-valued ``quote`` column.

    Returns:
        pandas Series (aligned with X's index) of space-joined
        lemmatized tokens, one string per row.
    """
    # X["quote"] instead of X.quote: attribute access silently breaks if
    # the column name ever collides with a DataFrame attribute/method.
    # A single .apply() per row replaces the original's three chained
    # .apply() passes over the whole Series.
    return X["quote"].apply(
        lambda q: " ".join(lemmatize_tokens(tokenize_quote(q)))
    )
| def test_scorer(ytrue, ypred, blubb = 1): | |
| return F"this works! even with arguments {blubb}" |