Upload 2 files
Browse files- TweetNormalizer.py +59 -0
- requirements.txt +11 -0
TweetNormalizer.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from emoji import demojize
|
| 2 |
+
from nltk.tokenize import TweetTokenizer
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Shared module-level tokenizer instance; normalizeTweet uses it to split a
# tweet into tokens.
tokenizer = TweetTokenizer()
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Rules, applied in this order:

    * a token starting with "@" becomes "@USER";
    * a token starting with "http" or "www" (case-insensitive) becomes
      "HTTPURL";
    * the typographic apostrophe "’" becomes "'";
    * the ellipsis character "…" becomes "...";
    * any other single-character token is passed through ``demojize``;
    * everything else is returned unchanged.

    Args:
        token: One token string.

    Returns:
        The normalized token string.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # These two checks must come before the generic single-character branch:
    # both characters have length 1, so testing them afterwards (as an
    # alternative to ``len(token) == 1``) made them unreachable.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def normalizeTweet(tweet):
    """Tokenize a raw tweet and return its normalized, single-spaced form.

    Steps:

    1. Replace the typographic apostrophe and the ellipsis character with
       their ASCII equivalents.
    2. Tokenize with the module-level ``tokenizer`` and run every token
       through ``normalizeToken``.
    3. Apply a fixed, ordered list of substring rewrites that adjust how
       contractions and "a.m."/"p.m." are spaced.
    4. Collapse all runs of whitespace to single spaces.

    Args:
        tweet: The raw tweet text.

    Returns:
        The normalized tweet as a single string.
    """
    ascii_text = tweet.replace("’", "'").replace("…", "...")
    joined = " ".join(normalizeToken(tok) for tok in tokenizer.tokenize(ascii_text))

    # Ordered (old, new) pairs. Each rewrite operates on the result of the
    # previous one, so the order must not change.
    # NOTE: most patterns end in a space, and the joined string has no
    # trailing space, so a matching suffix on the very last token is not
    # rewritten.
    rewrites = (
        # Negations.
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        # Clitics.
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        # Re-join time abbreviations that were split into separate tokens.
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in rewrites:
        joined = joined.replace(old, new)

    # The rewrites above can introduce doubled spaces; squeeze them out.
    return " ".join(joined.split())
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
if __name__ == "__main__":
    # Demo run: normalize one sample tweet that contains a URL, an ellipsis
    # character and a user mention, then print the result.
    sample_tweet = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    print(normalizeTweet(sample_tweet))
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
joblib
|
| 2 |
+
transformers
|
| 3 |
+
matplotlib
|
| 4 |
+
pandas
|
| 5 |
+
emoji
|
| 6 |
+
nltk
|
| 7 |
+
seaborn
|
| 8 |
+
numpy
|
| 9 |
+
torch
|
| 10 |
+
tensorflow
|
| 11 |
+
tf-keras
|