Spaces:

vikranth1111
/

cap

Configuration error

cap / preprocessing.py

Upload 18 files

9e2ba5f about 2 years ago

1.14 kB

	import re
	import string

	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import TweetTokenizer

	# Runs at import time: ask NLTK to download the "stopwords" corpus, which
	# clean_tweet reads through stopwords.words("english").
	nltk.download("stopwords")

	def clean_tweet(tweet: str) -> str:
		"""
		Normalise a tweet into lowercase, space-separated content words.

		The text is lowercased, a leading old-style retweet marker ("RT") is
		dropped, stock market tickers such as $GE and hyperlinks are removed, all
		ASCII punctuation (including the "#" and "@" signs, but not the words they
		prefix) is stripped, and NLTK English stopwords are filtered out. Emojis
		and other non-ASCII symbols are left untouched.

		:param tweet: raw tweet text
		:return: cleaned, lowercase string without tickers, links, punctuation or stopwords
		"""
		# make text lower case
		tweet = tweet.lower()
		# remove old style retweet text "RT"; the text is already lowercase at this
		# point, so the marker has to be matched as "rt"
		tweet = re.sub(r'^\s*rt\s+', '', tweet)
		# remove stock market tickers like $GE
		tweet = re.sub(r'\$\w*', '', tweet)
		# remove hyperlinks: drop the whole URL up to the next whitespace, before
		# punctuation stripping could fuse it into a single junk token
		tweet = re.sub(r'https?://\S+', '', tweet)

		# remove punctuation; "#" is part of string.punctuation, so hashtags keep
		# their word and lose only the hash sign
		tweet = tweet.translate(str.maketrans('', '', string.punctuation))

		# remove stopwords; split()/join also collapses runs of whitespace
		stop_words = set(stopwords.words("english"))
		return " ".join(word for word in tweet.split() if word not in stop_words)