Spaces:
Sleeping
Sleeping
| import html | |
def clear_reply_mentions(tweet):
    """Strip the leading run of @-mentions that Twitter prepends to replies.

    Example: "@user1 @user2 okay @user3" -> "okay @user3"

    Only the contiguous mentions at the very start are removed; mentions
    appearing later in the text are kept.
    """
    # Plain whitespace splitting is enough here; no NLP tokenizer needed.
    pieces = tweet.split(" ")
    keep_from = 0
    while keep_from < len(pieces) and pieces[keep_from].startswith("@"):
        keep_from += 1
    # If every token was a mention this joins an empty list, yielding "".
    return " ".join(pieces[keep_from:])
| from emoji import demojize, is_emoji | |
| from nltk.tokenize import TweetTokenizer | |
# Shared default tokenizer instance, used as the default `tokenizer`
# argument of normalizeTweet so construction happens once at import time.
tweet_tokenizer = TweetTokenizer()
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    """Normalize a single tweet token.

    Params:
        token: the token to normalize
        emojis_found: optional list; any emoji token encountered is appended
            to it so the caller can collect emojis as a side effect
        replace_user_mentions: if True, replace "@..." tokens with "@USER"
        replace_urls: if True, replace http/www tokens with "HTTPURL"
        demojize_emojis: if True, convert emoji tokens to their ":name:" form
    Returns:
        The normalized token string.
    """
    # FIX: the default used to be a mutable `[]`, which Python shares across
    # calls — emojis would silently accumulate forever when the kwarg was
    # omitted. A None sentinel gives each call its own fresh list.
    if emojis_found is None:
        emojis_found = []
    lowercased_token = token.lower()
    if token.startswith("@") and replace_user_mentions:
        return "@USER"
    elif (lowercased_token.startswith("http") or lowercased_token.startswith("www")) and replace_urls:
        return "HTTPURL"
    elif len(token) == 1 and is_emoji(token):
        # Record the emoji for the caller whether or not we demojize it.
        emojis_found.append(token)
        return demojize(token) if demojize_emojis else token
    else:
        # Map typographic apostrophe / ellipsis to their ASCII equivalents.
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    """Tokenize a tweet and normalize each token.

    Params:
        tweet: the raw tweet text
        tokenizer: object exposing .tokenize(str) -> list[str]
        replace_user_mentions / replace_urls / demojize_emojis:
            forwarded to normalizeToken
        bert_tweet_specific_processing: if True, apply the contraction and
            a.m./p.m. re-spacing rules expected by the BERTweet model
    Returns:
        (normalized_tweet, emojis_found) — the normalized string and the
        list of emoji tokens collected while normalizing.
    """
    emojis_found = []
    # Pre-map typographic apostrophes/ellipses so the tokenizer sees ASCII.
    prepared = tweet.replace("’", "'").replace("…", "...")
    normTweet = " ".join(
        normalizeToken(tok,
                       emojis_found=emojis_found,
                       replace_user_mentions=replace_user_mentions,
                       replace_urls=replace_urls,
                       demojize_emojis=demojize_emojis)
        for tok in tokenizer.tokenize(prepared)
    )
    if bert_tweet_specific_processing:
        # Contraction and abbreviation re-spacing; these string rewrites are
        # heuristic and ORDER-SENSITIVE — apply each pair in sequence.
        for old, new in (
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        ):
            normTweet = normTweet.replace(old, new)
    # Collapse whitespace runs introduced by the replacements.
    return " ".join(normTweet.split()), emojis_found
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Helper function to clean tweets. Highly customizable to fit different needs.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, will unescape any special html entities found in the tweet
        replace_user_mentions: If true, will replace any user mention with the token @USER
        replace_urls: If true, will replace any urls with the token HTTPURL
        demojize_emojis: If true, will demojize emojis
        bert_tweet_specific_processing: if true, will do some additional preprocessing for the BertTweet model
    Returns:
        The cleaned tweet. (The emoji list produced by normalizeTweet is
        discarded here; callers needing it should call normalizeTweet directly.)
    '''
    # First step: clear mentions at the beginning of tweets (inserted automatically by Twitter when replying to a tweet).
    # These do not count in the character count of a tweet and may make the tweet length go way overboard.
    cleaned_tweet = clear_reply_mentions(tweet)
    # Second step: Remove any new lines.
    # NOTE(review): '\n' is replaced with '' (not ' '), so words on adjacent
    # lines get concatenated — confirm this is intended.
    cleaned_tweet = cleaned_tweet.replace('\r', '').replace('\n', '')
    # Third step: if True, unescape any html entities (e.g. "&amp;" -> "&")
    if clear_html_chars:
        cleaned_tweet = html.unescape(cleaned_tweet)
    # Normalize Tweet with remaining preprocessing (emojis, urls, mentions, etc..)
    normalized_tweet, emojis = normalizeTweet(cleaned_tweet,
                                              replace_user_mentions=replace_user_mentions,
                                              replace_urls=replace_urls,
                                              demojize_emojis=demojize_emojis,
                                              bert_tweet_specific_processing=bert_tweet_specific_processing)
    # TODO: process emoticons? e.g. :)
    return normalized_tweet