# File size: 966 Bytes
# 30be50f
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# NLTK data needed at import time / by text_preprocessing.
nltk.download('stopwords')
# NOTE(review): 'wordnet' is not used by the code visible here (stemming is
# done with PorterStemmer); the download is kept in case other code needs it.
nltk.download('wordnet')
# word_tokenize relies on the Punkt tokenizer models. Older NLTK releases
# look for 'punkt', newer ones for 'punkt_tab'; requesting both keeps the
# module working on either. nltk.download reports (rather than raises on)
# a package name that the installed NLTK's index does not know.
nltk.download('punkt')
nltk.download('punkt_tab')

# English stop words, kept as the list returned by NLTK.
stop_words_en = stopwords.words("english")

# Initialize the Porter stemmer
stemmer = PorterStemmer()
# Function for text preprocessing
def text_preprocessing(text: str) -> str:
    """Clean a raw text string and return its stemmed tokens joined by spaces.

    Steps, in order: lowercase; strip URLs, @mentions, #hashtags and literal
    ``\\n`` sequences; replace every character that is not a letter,
    whitespace or apostrophe with a space; tokenize with ``word_tokenize``;
    drop tokens found in the module-level ``stop_words_en``; stem the rest
    with the module-level ``stemmer``.

    Args:
        text: The raw input text.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty
        string when nothing survives).
    """
    # Case folding
    text = text.lower()

    # URLs are removed first: a URL may itself contain '@' or '#', and
    # stripping mentions/hashtags beforehand would split it and leave its
    # tail behind as stray tokens. The dot after "www" is escaped so that
    # ordinary words such as "awww" followed by another word are not eaten.
    text = re.sub(r"http\S+|www\.\S+", " ", text)

    # Mentions and hashtags
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)

    # Literal two-character "\n" sequences (a backslash followed by 'n').
    # Real newline characters are whitespace and are kept by the character
    # filter below.
    text = re.sub(r"\\n", " ", text)

    # Anything that is not a letter, whitespace or an apostrophe
    text = re.sub(r"[^A-Za-z\s']", " ", text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stop-word removal (checked on the raw token) followed by stemming
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words_en]

    # Combining tokens
    return " ".join(tokens)