sentiment_analysis/src/data/preprocess.py
Moncey10's picture
Upload 15 files
4112bd3 verified
raw
history blame contribute delete
434 Bytes
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance, created once at import time and reused by
# clean_text() for every token.
port_stem = PorterStemmer()
# English stop words materialised as a set so the membership test in
# clean_text() is a hash lookup rather than a list scan.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to already be
# available locally when this module is imported - confirm.
stop_words = set(stopwords.words('english'))
def clean_text(text: str) -> str:
    """Normalise raw text into a space-joined string of stemmed tokens.

    The input is coerced with ``str()``, every character that is not an
    ASCII letter is replaced by a space, and the result is lower-cased and
    split on whitespace. Tokens found in the module-level ``stop_words`` set
    are discarded; the remaining tokens are stemmed with the module-level
    ``port_stem`` and joined with single spaces.

    Args:
        text: The raw text to clean. Non-string values are converted with
            ``str()`` before processing.

    Returns:
        The cleaned text; an empty string if no tokens survive filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text))

    stems = []
    for token in letters_only.lower().split():
        # Filter on the lower-cased token *before* stemming, so the lookup
        # is done against the unstemmed stop-word list.
        if token in stop_words:
            continue
        stems.append(port_stem.stem(token))

    return ' '.join(stems)