import re
import string
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

nltk.download("stopwords")

# Old-style retweet marker at the very start of the text. It is matched in
# upper case, so it must be applied BEFORE the text is lowercased.
_RETWEET_RE = re.compile(r'^RT\s+')
# Stock market tickers such as "$GE" (a dollar sign followed by word chars).
_TICKER_RE = re.compile(r'\$\w*')
# A single hyperlink token; stops at the first whitespace so that words
# following the link are preserved.
_URL_RE = re.compile(r'https?://\S+')
# Translation table that deletes every ASCII punctuation character
# (this includes '#', so hashtags keep their word but lose the hash sign).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words() -> frozenset:
    """Return the NLTK English stop-word list as a set, loaded only once."""
    return frozenset(stopwords.words("english"))


def clean_tweet(tweet: str) -> str:
    """
    Normalise a tweet for downstream text processing.

    Steps, in order: drop a leading old-style "RT" retweet marker, lowercase
    the text, remove stock market tickers (e.g. ``$GE``), remove hyperlinks,
    strip ASCII punctuation (which also removes the ``#`` of hashtags and the
    ``@`` of mentions while keeping the word itself), and finally remove
    English stop words. Whitespace is collapsed to single spaces as a side
    effect of the final split/join.

    Emojis and other non-ASCII symbols are NOT removed.

    :param tweet: raw tweet text
    :return: lowercased string without tickers, hyperlinks, ASCII
        punctuation and English stop words
    """
    # Strip the retweet marker while the original casing is still available;
    # after lowercasing, an upper-case "RT" pattern could never match.
    tweet = _RETWEET_RE.sub('', tweet)

    tweet = tweet.lower()

    # Tickers and URLs must go before punctuation stripping, because both
    # are recognised by punctuation characters ('$', '://').
    tweet = _TICKER_RE.sub('', tweet)
    tweet = _URL_RE.sub('', tweet)

    # One C-level pass that deletes all punctuation, including '#'.
    tweet = tweet.translate(_PUNCT_TABLE)

    stop_words = _english_stop_words()
    return " ".join(word for word in tweet.split() if word not in stop_words)