| from emoji import demojize | |
| from nltk.tokenize import TweetTokenizer | |
# Module-level NLTK TweetTokenizer, built once with default settings and
# reused by normalizeTweet() for every call.
tokenizer = TweetTokenizer()
def normalizeToken(token: str) -> str:
    """Return the normalized form of a single token.

    Rules are applied in this order:

    * a token starting with ``@`` becomes ``"@USER"``;
    * a token starting with ``http`` or ``www`` (case-insensitive) becomes
      ``"HTTPURL"``;
    * the typographic apostrophe ``’`` becomes ``'`` and the ellipsis
      character ``…`` becomes ``...``;
    * any other single-character token is passed through ``demojize``;
    * every other token is returned unchanged.

    Args:
        token: One token, as produced by the tokenizer.

    Returns:
        The normalized token string.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # Both special characters are exactly one character long, so they must be
    # tested before the generic single-character branch; checked afterwards
    # they would never be reached.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token
def normalizeTweet(tweet):
    """Normalize a raw tweet string and return it as space-separated tokens.

    Steps:
      1. Replace the typographic apostrophe and the ellipsis character with
         their ASCII forms.
      2. Tokenize with the module-level ``tokenizer`` and normalize each
         token with ``normalizeToken``.
      3. Apply a fixed, ordered list of substring rewrites that adjust
         spacing around contractions and "a.m." / "p.m." abbreviations.
      4. Collapse every run of whitespace into a single space.
    """
    ascii_tweet = tweet.replace("’", "'").replace("…", "...")
    normalized = " ".join(
        normalizeToken(piece) for piece in tokenizer.tokenize(ascii_tweet)
    )

    # Order matters: later rewrites act on the output of earlier ones.
    rewrites = (
        # Negations.
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        # Clitics.
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        # Time abbreviations split apart by tokenization.
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in rewrites:
        normalized = normalized.replace(old, new)

    return " ".join(normalized.split())
if __name__ == "__main__":
    # Demo: normalize one sample tweet and print the result.
    sample_tweet = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    print(normalizeTweet(sample_tweet))