Spaces:
Runtime error
Runtime error
Fixed disappearing profanities
Browse files
- app.py +9 -17
- lookup_words.txt +1 -0
app.py
CHANGED
|
@@ -31,9 +31,8 @@ obj_pronouns = read_text('obj_pronouns')
|
|
| 31 |
profanities = read_text('profanities', 'json')
|
| 32 |
|
| 33 |
|
| 34 |
-
def
|
| 35 |
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
|
| 36 |
-
obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
|
| 37 |
matches = dict()
|
| 38 |
|
| 39 |
# Loop each word in tweet
|
|
@@ -58,7 +57,7 @@ def fuzzyLookup(tweet):
|
|
| 58 |
|
| 59 |
for word, matched_profanity in matches.items():
|
| 60 |
word_split = word.split(matched_profanity[-2:])
|
| 61 |
-
for pronoun in
|
| 62 |
if len(word_split) > 1:
|
| 63 |
if pronoun == word_split[-1]:
|
| 64 |
matches[word] = matched_profanity + ' ' + pronoun
|
|
@@ -68,13 +67,12 @@ def fuzzyLookup(tweet):
|
|
| 68 |
for word, matched_profanity in matches.items():
|
| 69 |
tweet = tweet.replace(word, matched_profanity)
|
| 70 |
|
| 71 |
-
tweet_split = tweet.split()
|
| 72 |
for profanity, prof_varations in profanities.items():
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
tweet = ' '.join(tweet_split)
|
| 77 |
|
|
|
|
| 78 |
return tweet, matches
|
| 79 |
|
| 80 |
|
|
@@ -108,10 +106,6 @@ def preprocess(tweet):
|
|
| 108 |
if any(x in word for x in laugh_texts):
|
| 109 |
row_split[index] = 'haha'
|
| 110 |
|
| 111 |
-
# Remove words with digits (4ever)
|
| 112 |
-
if any(x.isdigit() for x in word):
|
| 113 |
-
row_split[index] = ''
|
| 114 |
-
|
| 115 |
# Combine list of words back to sentence
|
| 116 |
combined_text = ' '.join(filter(None, row_split))
|
| 117 |
|
|
@@ -136,9 +130,8 @@ def query(payload):
|
|
| 136 |
|
| 137 |
def predict(tweet):
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
output = query(processed_text)
|
| 142 |
|
| 143 |
if 'error' in output:
|
| 144 |
return output['error'], 'Error occured. Try again later.', {"error": "error"}
|
|
@@ -149,14 +142,13 @@ def predict(tweet):
|
|
| 149 |
|
| 150 |
if predicted_label == 'Abusive':
|
| 151 |
for base_word, _ in matches.items():
|
|
|
|
| 152 |
tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
|
| 153 |
|
| 154 |
return output, tweet, json.dumps(matches)
|
| 155 |
else:
|
| 156 |
return output, tweet, json.dumps(matches)
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
|
| 161 |
|
| 162 |
|
|
|
|
| 31 |
profanities = read_text('profanities', 'json')
|
| 32 |
|
| 33 |
|
| 34 |
+
def fuzzy_lookup(tweet):
|
| 35 |
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
|
|
|
|
| 36 |
matches = dict()
|
| 37 |
|
| 38 |
# Loop each word in tweet
|
|
|
|
| 57 |
|
| 58 |
for word, matched_profanity in matches.items():
|
| 59 |
word_split = word.split(matched_profanity[-2:])
|
| 60 |
+
for pronoun in obj_pronouns:
|
| 61 |
if len(word_split) > 1:
|
| 62 |
if pronoun == word_split[-1]:
|
| 63 |
matches[word] = matched_profanity + ' ' + pronoun
|
|
|
|
| 67 |
for word, matched_profanity in matches.items():
|
| 68 |
tweet = tweet.replace(word, matched_profanity)
|
| 69 |
|
|
|
|
| 70 |
for profanity, prof_varations in profanities.items():
|
| 71 |
+
if len(prof_varations) > 0:
|
| 72 |
+
for prof_variant in prof_varations:
|
| 73 |
+
tweet = tweet.replace(prof_variant, profanity)
|
|
|
|
| 74 |
|
| 75 |
+
print('Fuzzy Returns:', tweet)
|
| 76 |
return tweet, matches
|
| 77 |
|
| 78 |
|
|
|
|
| 106 |
if any(x in word for x in laugh_texts):
|
| 107 |
row_split[index] = 'haha'
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
# Combine list of words back to sentence
|
| 110 |
combined_text = ' '.join(filter(None, row_split))
|
| 111 |
|
|
|
|
| 130 |
|
| 131 |
def predict(tweet):
|
| 132 |
|
| 133 |
+
fuzzy_text, matches = fuzzy_lookup(tweet)
|
| 134 |
+
output = query(preprocess(fuzzy_text))
|
|
|
|
| 135 |
|
| 136 |
if 'error' in output:
|
| 137 |
return output['error'], 'Error occured. Try again later.', {"error": "error"}
|
|
|
|
| 142 |
|
| 143 |
if predicted_label == 'Abusive':
|
| 144 |
for base_word, _ in matches.items():
|
| 145 |
+
|
| 146 |
tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
|
| 147 |
|
| 148 |
return output, tweet, json.dumps(matches)
|
| 149 |
else:
|
| 150 |
return output, tweet, json.dumps(matches)
|
| 151 |
|
|
|
|
|
|
|
| 152 |
hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
|
| 153 |
|
| 154 |
|
lookup_words.txt
CHANGED
|
@@ -152,4 +152,5 @@ kang
|
|
| 152 |
bubuka
|
| 153 |
buka
|
| 154 |
talaga
|
|
|
|
| 155 |
g@g0
|
|
|
|
| 152 |
bubuka
|
| 153 |
buka
|
| 154 |
talaga
|
| 155 |
+
tuloy
|
| 156 |
g@g0
|