File size: 434 Bytes
4112bd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance, created once at import time and reused by
# clean_text for every token.
port_stem = PorterStemmer()
# English stop-word list converted to a set so the per-token membership test
# in clean_text is a hash lookup rather than a list scan.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally at import time — confirm in the deployment setup.
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Normalise *text* into a space-separated string of stemmed tokens.

    The input is coerced with ``str()``, every character outside the ASCII
    letters ``a-z``/``A-Z`` is replaced by a space, and the result is
    lower-cased and split on whitespace. Tokens found in the module-level
    ``stop_words`` set are dropped; the remaining tokens are stemmed with
    the module-level ``port_stem`` and joined with single spaces.

    Args:
        text: The raw text to clean. Non-string values are converted with
            ``str()`` before processing.

    Returns:
        The cleaned text; an empty string when no tokens survive.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text))

    stems = []
    for token in letters_only.lower().split():
        # The stop-word check runs on the lower-cased token, before stemming.
        if token in stop_words:
            continue
        stems.append(port_stem.stem(token))

    return ' '.join(stems)