Profanity with hashtag detection
- app.py +64 -40
- contractions.json +4 -1
app.py CHANGED
@@ -7,8 +7,7 @@ from thefuzz import process, fuzz
 import numpy as np
 import re
 import nltk
-
-from nltk.corpus import words
+from english_words import get_english_words_set
 
 
 API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
@@ -36,58 +35,86 @@ def read_text(filename, filetype='txt'):
 contractions = read_text('contractions', 'json')
 similar_words = read_text('similar_words')
 addon_words = read_text('addon_words')
-profanities = read_text('profanities', 'json')
-lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())]).tolist()
-lookup_words = list(set(similar_words).union(set(lookup_profanity)))
-eng_words = list(set(words.words()) - set(lookup_profanity))
+profanities_dict = read_text('profanities', 'json')
+lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
+lookup_words = list(set(similar_words).union(set(lookup_profanity)))
+eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))
 punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
 
-# TODO check eng words that are tagalog profanities
-
 def fuzzy_lookup(tweet):
 
     matched_profanity = dict()
 
-    # tweet = punctuations.sub('', tweet).lower()
-
     for word in tweet.split():
 
+        if word in eng_words:
+            continue
+
+        scores = []
+        matched_words = []
+        matched_word = None
+
+        # Remove trailing punctuations except # and @
         word = punctuations.sub('', word).lower()
+
+        # Save base word
         base_word = word
+
+        # Shortent elongated word
        word = re.sub(r'(.)\1{2,}', r'\1', word)
+
+        # Remove # and @
+        if word.startswith("#") or word.startswith("@"):
+            word = word[1:]
+
+        # Remove trailing words (mo, ka, pinaka)
         for addon in addon_words:
             if word.startswith(addon):
                 word = word[len(addon):]
             if word.endswith(addon):
                 word = word[:-len(addon)]
 
-        if word in eng_words:
-            continue
+        if len(word) < 4:
+            continue
 
+        # Get fuzzy ratio
+        for lookup_word in lookup_words:
+
+            score = fuzz.ratio(word, lookup_word)
+
+            # Threshold
+            if score >= 70:
+                scores.append(score)
+                matched_words.append(lookup_word)
+
+        if len(scores) == 0:
+            continue
+
+        if len(set(scores)) == 1:
+            for matched_word in matched_words:
+                if matched_word in lookup_profanity:
+                    matched_word = matched_word
+                    break
+        else:
+            # Get matched word with max score
+            max_score_index = np.argmax(scores)
+            matched_word = matched_words[max_score_index]
+
+        if matched_word not in lookup_profanity:
+            continue
+
+        for base_profanity, profanity_variations in profanities_dict.items():
+
+            if matched_word in profanity_variations or matched_word == base_profanity:
+
+                # Seperate pronouns
+                for addon in addon_words:
+                    if base_word.endswith(addon):
+                        base_profanity = base_profanity + " " + addon
+                        break
 
-        scores = []
-        matched_words = []
-        for lookup_word in lookup_words:
-            score = fuzz.ratio(word, lookup_word)
-            if score >= 70:
-                scores.append(score)
-                matched_words.append(lookup_word)
-        if len(scores) > 0:
-            max_score_index = np.argmax(scores)
-            if matched_words[max_score_index] in lookup_profanity:
-                for base_profanity, profanity_variations in profanities.items():
-                    if matched_words[max_score_index] == base_profanity:
-                        matched_profanity[base_word] = base_profanity
-                        break
-                    if matched_words[max_score_index] in profanity_variations:
-                        matched_profanity[base_word] = base_profanity
-                        break
+                matched_profanity[base_word] = base_profanity
+                break
 
     return matched_profanity
 
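Taken together, the rewritten fuzzy_lookup() normalizes each token (strip edge punctuation, collapse elongated spellings, drop a leading # or @, peel addon words) and then fuzzy-matches it against the lookup tables. Below is a minimal, self-contained sketch of that pipeline; the lookup tables are toy stand-ins for the real profanities.json / addon_words data, and the tie-breaking and pronoun re-attachment steps are omitted.

import re

import numpy as np
from thefuzz import fuzz

# Toy stand-ins for the data files the Space loads with read_text().
profanities_dict = {"badword": ["badwrd", "b4dword"]}
addon_words = ["pinaka", "mo", "ka"]
lookup_words = list(profanities_dict) + [v for vs in profanities_dict.values() for v in vs]
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')

def normalize(word):
    word = punctuations.sub('', word).lower()  # strip edge punctuation, keeping # and @
    word = re.sub(r'(.)\1{2,}', r'\1', word)   # collapse 3+ repeated chars ("ooo" -> "o")
    if word.startswith(('#', '@')):            # hashtag/mention detection: drop the marker
        word = word[1:]
    for addon in addon_words:                  # peel addon words off both ends
        if word.startswith(addon):
            word = word[len(addon):]
        if word.endswith(addon):
            word = word[:-len(addon)]
    return word

def best_match(word, threshold=70):
    scores = [fuzz.ratio(word, lookup_word) for lookup_word in lookup_words]
    best = int(np.argmax(scores))
    return lookup_words[best] if scores[best] >= threshold else None

print(best_match(normalize("#badwooordmo!!!")))  # -> badword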
@@ -108,11 +135,6 @@ def preprocess(tweet, profanities):
 
     for index, word in enumerate(row_split):
 
-        # Seperate pronouns
-        for addon in addon_words:
-            if word.endswith(addon):
-                row_split[index] = word[:-len(addon)] + " " + addon
-
         # Remove links
         if 'http' in word:
             row_split[index] = ''
@@ -150,9 +172,11 @@ def predict(tweet):
         print(prediction)
         error_message = prediction['error']
         return error_message, {}
+
     prediction = prediction[0][0]["label"]
 
     print("\nTWEET:", tweet)
+    print("PROCESSED TWEET:", preprocessed_tweet)
     print("DETECTED PROFANITY:", list(profanities.keys()))
     print("LABEL:", prediction, "\n")
 
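For reference, this is roughly the round trip predict() makes against the hosted model. The query() helper and token plumbing here are assumptions for illustration, not code from this commit; the nested prediction[0][0]["label"] indexing and the 'error' key it guards against are taken from the hunk above.

import requests

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"

def query(tweet, token=None):  # hypothetical helper
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    response = requests.post(API_URL, headers=headers, json={"inputs": tweet})
    prediction = response.json()

    # While the model is cold-starting, the Inference API answers with an
    # {"error": ...} dict instead of predictions, the case guarded above.
    if isinstance(prediction, dict) and 'error' in prediction:
        return prediction['error'], {}

    # Text-classification output is a nested list: [[{"label": ..., "score": ...}, ...]]
    return prediction[0][0]["label"], prediction[0][0]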
contractions.json CHANGED

@@ -29,5 +29,8 @@
     "kelan": "kailan",
     "raw": "daw",
     "itong": "ito ang",
-    "lng": "lang"
+    "lng": "lang",
+    "putang ina": "putangina",
+    "tangina" : "tangina",
+    "inamo" : "ina mo"
 }
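The new entries fold variant spellings into one canonical token before lookup. Below is a short sketch of how such a table is typically applied during preprocessing; the expand_contractions() helper is hypothetical, and app.py's actual preprocess() is not shown in this diff.

contractions = {
    "kelan": "kailan",
    "raw": "daw",
    "lng": "lang",
    "putang ina": "putangina",
    "inamo": "ina mo",
}

def expand_contractions(tweet, table):  # hypothetical helper
    # Multi-word keys ("putang ina") must be replaced before tokenizing.
    for key in table:
        if " " in key:
            tweet = tweet.replace(key, table[key])
    # Then map the remaining tokens one by one.
    return " ".join(table.get(token, token) for token in tweet.split())

print(expand_contractions("kelan ka raw lng", contractions))  # -> kailan ka daw lang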