File size: 735 Bytes
05a7f14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch required NLTK data at import time (no-op if already present in the
# local NLTK data directory). Needed by stopwords / word_tokenize / lemmatizer.
nltk.download("stopwords")
nltk.download('punkt_tab')
nltk.download('wordnet')
# NOTE: this import must come after the "stopwords" download above.
from nltk.corpus import stopwords

import string

# Filter set used by tokenize_quote: English stopwords plus ASCII punctuation.
stop = set(stopwords.words('english') + list(string.punctuation))

def tokenize_quote(r):
    """Lowercase *r*, tokenize with NLTK, and drop stopwords/punctuation.

    Returns the surviving tokens as a list of strings, in original order.
    Relies on the module-level ``stop`` set for filtering.
    """
    return [tok for tok in nltk.word_tokenize(r.lower()) if tok not in stop]

def lemmatize_tokens(tokens: list):
    """Return the WordNet lemma of every token in *tokens*, order preserved."""
    # Bind the lemmatize method once so the loop body is a single call.
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))
    
def lemmatize_X(X):
    """Clean the ``quote`` column of DataFrame *X* for vectorization.

    Each quote is tokenized (lowercased, stopwords/punctuation removed),
    lemmatized, and re-joined into a single space-separated string.

    Returns a pandas Series of cleaned strings; *X* is not modified.
    """
    # Bracket access instead of X.quote: attribute-style access silently
    # resolves to a DataFrame attribute/method if the column name collides.
    # A single .apply pass per row replaces the original three chained passes.
    return X["quote"].apply(
        lambda q: " ".join(lemmatize_tokens(tokenize_quote(q)))
    )

def test_scorer(ytrue, ypred, blubb = 1):
    return F"this works! even with arguments {blubb}"