import nltk import re from nltk.stem import PorterStemmer from nltk.tokenize import word_tokenize from nltk.corpus import stopwords nltk.download('stopwords') nltk.download('wordnet') stop_words_en = stopwords.words("english") # Inisialisasi WordNetLemmatizer stemmer = PorterStemmer() # Function for text preprocessing def text_preprocessing(text): # Case folding text = text.lower() # Mention, hashtag, newline, URL, and non-letter removal text = re.sub("@[A-Za-z0-9_]+", " ", text) text = re.sub("#[A-Za-z0-9_]+", " ", text) text = re.sub(r"\\n", " ", text) text = re.sub(r"http\S+|www.\S+", " ", text) text = re.sub("[^A-Za-z\s']", " ", text) # Tokenization tokens = word_tokenize(text) # Stopwords removal tokens = [word for word in tokens if word not in stop_words_en] # Stemming tokens = [stemmer.stem(word) for word in tokens] # Combining Tokens text = ' '.join(tokens) return text