cap / preprocessing.py
vikranth1111's picture
Upload 18 files
9e2ba5f
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
# Fetch the NLTK "stopwords" corpus when this module is imported; clean_tweet
# reads it through stopwords.words("english").
# NOTE(review): this runs on every import and may touch the network — confirm
# that is acceptable for the deployment environment.
nltk.download("stopwords")
# Patterns are compiled once at import time; every one of them is applied to
# text that has already been lowercased.
_TICKER_RE = re.compile(r"\$\w*")
_RETWEET_RE = re.compile(r"^rt\s+")
_URL_RE = re.compile(r"https?://\S+")
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
# Lazily populated by _english_stop_words().
_stop_words_cache = None


def _english_stop_words():
    """Return the NLTK English stopwords as a set, loading them on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words("english"))
    return _stop_words_cache


def clean_tweet(tweet: str) -> str:
    """
    Normalise a tweet for downstream text processing.

    Steps, in order: lowercase the text, remove stock market tickers
    (e.g. "$GE"), remove a leading old-style retweet marker ("RT "), remove
    hyperlinks, strip all ASCII punctuation (which also drops the "#" of a
    hashtag and the "@" of a mention while keeping the word itself), and
    finally drop English stopwords.

    Emojis and other non-ASCII symbols are NOT removed.

    :param tweet: tweet by a unique user
    :return: lowercased, space-separated words without tickers, links,
        punctuation or English stopwords
    """
    text = tweet.lower()
    # remove stock market tickers like $GE
    text = _TICKER_RE.sub("", text)
    # remove old style retweet text "RT"; the text is already lowercase, so
    # the marker has to be matched as "rt" (matching "RT" here never fires)
    text = _RETWEET_RE.sub("", text)
    # remove hyperlinks: only the URL itself, not the words that follow it
    text = _URL_RE.sub("", text)
    # remove punctuation; "#" is part of string.punctuation, so hashtags keep
    # their word and lose only the hash sign
    text = text.translate(_PUNCT_TABLE)
    # remove stopwords
    # NOTE(review): punctuation is stripped before this step, so apostrophe
    # contractions (e.g. "don't" -> "dont") will likely not match the
    # stopword list — confirm whether that is intended.
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)