Spaces:

devinlee14
/

test

Sleeping

test / functions /text_preprocessed.py

Upload 28 files

e0c55bb about 2 years ago

1.01 kB

	"""
	// function.py //
	This module stores the text-preprocessing function used throughout this project.
	"""


	import re
	from nltk.tokenize import word_tokenize

	# Create A Function for Text Preprocessing
	def text_preprocessing(text: str, lemmatizer, sw) -> str:
	    """Normalize a raw text string into a cleaned, lemmatized string.

	    Steps, in order: lowercase, drop literal ``\\n`` sequences, drop URLs,
	    drop @mentions and #hashtags, blank out every character that is not an
	    ASCII letter, whitespace or apostrophe, tokenize, remove stop words,
	    lemmatize, and re-join the tokens with single spaces.

	    Args:
	        text: The raw input string.
	        lemmatizer: Any object exposing ``lemmatize(word)``; it is called
	            once per surviving token.
	        sw: Collection of stop words; tokens found in it are dropped.
	            Entries are assumed to be hashable (e.g. strings) and
	            lowercase, since the text is lowercased before comparison.

	    Returns:
	        The processed tokens joined by a single space.
	    """
	    # Case folding
	    text = text.lower()

	    # Literal "\n" removal (a backslash followed by 'n', as found in
	    # escaped/exported text). Done before URL removal so that a URL glued
	    # to the following word by a literal "\n" does not swallow that word.
	    text = re.sub(r"\\n", " ", text)

	    # URL removal. Runs before mention/hashtag removal so that URLs
	    # containing '@' or '#' are removed whole instead of leaving
	    # path fragments behind.
	    text = re.sub(r"http\S+", " ", text)
	    # The dot is escaped and a word boundary required so that ordinary
	    # words merely containing "www" are not deleted.
	    text = re.sub(r"\bwww\.\S+", " ", text)

	    # Mention removal
	    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)

	    # Hashtag removal
	    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)

	    # Non-letter removal (emoticons, digits, symbols such as μ, $, 兀, etc.).
	    # Raw string avoids the invalid-escape-sequence warning for "\s".
	    text = re.sub(r"[^A-Za-z\s']", " ", text)

	    # Leading/trailing whitespace removal
	    text = text.strip()

	    # Tokenization
	    tokens = word_tokenize(text)

	    # Build a set once so each stop-word membership test is O(1).
	    stop_words = sw if isinstance(sw, (set, frozenset)) else set(sw)

	    # Stop-word removal followed by lemmatization of the remaining tokens
	    lemmas = [
	        lemmatizer.lemmatize(word)
	        for word in tokens
	        if word not in stop_words
	    ]

	    # Combining tokens
	    return " ".join(lemmas)