Spaces:

devinlee14
/

P2G7_deployment_devin_lee

Sleeping

Graded Challenge 7, by Devin Lee

30be50f about 2 years ago

966 Bytes

	import nltk
	import re
	from nltk.stem import PorterStemmer
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords

	nltk.download('stopwords')
	nltk.download('wordnet')

	stop_words_en = stopwords.words("english")

	# Inisialisasi WordNetLemmatizer
	stemmer = PorterStemmer()
	# Function for text preprocessing
	def text_preprocessing(text):
	# Case folding
	text = text.lower()

	# Mention, hashtag, newline, URL, and non-letter removal
	text = re.sub("@[A-Za-z0-9_]+", " ", text)
	text = re.sub("#[A-Za-z0-9_]+", " ", text)
	text = re.sub(r"\\n", " ", text)
	text = re.sub(r"http\S+\|www.\S+", " ", text)
	text = re.sub("[^A-Za-z\s']", " ", text)

	# Tokenization
	tokens = word_tokenize(text)

	# Stopwords removal
	tokens = [word for word in tokens if word not in stop_words_en]

	# Stemming
	tokens = [stemmer.stem(word) for word in tokens]

	# Combining Tokens
	text = ' '.join(tokens)

	return text