Spaces:
Runtime error
Runtime error
Fixed word lookup including emojis
Browse files
app.py
CHANGED
|
@@ -5,16 +5,12 @@ import re
|
|
| 5 |
import json
|
| 6 |
from thefuzz import process, fuzz
|
| 7 |
import numpy as np
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
|
| 11 |
headers = {"Authorization": "Bearer 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"}
|
| 12 |
|
| 13 |
-
profanities = ['bobo', 'bwiset','gago', 'kupal',
|
| 14 |
-
'pakshet', 'pakyu', 'pucha',
|
| 15 |
-
'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
|
| 16 |
-
'tarantado', 'ulol']
|
| 17 |
-
|
| 18 |
def read_text(filename, filetype='txt'):
|
| 19 |
words = []
|
| 20 |
|
|
@@ -42,6 +38,8 @@ def fuzzyLookup(tweet):
|
|
| 42 |
|
| 43 |
# Loop each word in tweet
|
| 44 |
for word in tweet.split():
|
|
|
|
|
|
|
| 45 |
scores = []
|
| 46 |
matched_words = []
|
| 47 |
# If word > 4 chars
|
|
@@ -77,24 +75,24 @@ def fuzzyLookup(tweet):
|
|
| 77 |
tweet_split[i] = profanity
|
| 78 |
tweet = ' '.join(tweet_split)
|
| 79 |
|
| 80 |
-
return tweet,
|
| 81 |
|
| 82 |
|
| 83 |
-
def preprocess(
|
| 84 |
laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
|
| 85 |
symbols = ['@', '#']
|
| 86 |
|
| 87 |
# Lowercase
|
| 88 |
-
|
| 89 |
|
| 90 |
# Remove emojis
|
| 91 |
-
|
| 92 |
|
| 93 |
# Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
|
| 94 |
-
|
| 95 |
|
| 96 |
# Split sentence into list of words
|
| 97 |
-
row_split =
|
| 98 |
|
| 99 |
for index, word in enumerate(row_split):
|
| 100 |
|
|
@@ -136,32 +134,27 @@ def query(payload):
|
|
| 136 |
return response.json()
|
| 137 |
|
| 138 |
|
| 139 |
-
def predict(
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
|
|
|
| 143 |
|
| 144 |
if 'error' in output:
|
| 145 |
return output['error'], 'Error occured. Try again later.', {"error": "error"}
|
| 146 |
else:
|
| 147 |
output = [tuple(i.values()) for i in output[0]]
|
| 148 |
output = dict((x, y) for x, y in output)
|
| 149 |
-
|
| 150 |
predicted_label = list(output.keys())[0]
|
| 151 |
|
| 152 |
if predicted_label == 'Abusive':
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
for i in profanity:
|
| 158 |
-
mask += "*" if i != " " else " "
|
| 159 |
-
output_text = compiled.sub(mask, output_text)
|
| 160 |
-
return output, output_text, matches
|
| 161 |
else:
|
| 162 |
-
return output,
|
| 163 |
-
|
| 164 |
-
# TODO gag0 not appearing
|
| 165 |
|
| 166 |
|
| 167 |
hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
|
|
@@ -174,7 +167,7 @@ demo = gr.Interface(
|
|
| 174 |
|
| 175 |
outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
|
| 176 |
gr.components.Text(label='OUTPUT'),
|
| 177 |
-
gr.components.JSON()],
|
| 178 |
|
| 179 |
examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
|
| 180 |
'Napakainit ngayong araw pakshet namaaan!!',
|
|
|
|
| 5 |
import json
|
| 6 |
from thefuzz import process, fuzz
|
| 7 |
import numpy as np
|
| 8 |
+
import re
|
| 9 |
|
| 10 |
|
| 11 |
API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
|
| 12 |
headers = {"Authorization": "Bearer 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"}
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def read_text(filename, filetype='txt'):
|
| 15 |
words = []
|
| 16 |
|
|
|
|
| 38 |
|
| 39 |
# Loop each word in tweet
|
| 40 |
for word in tweet.split():
|
| 41 |
+
# Only get digits and letters
|
| 42 |
+
word = re.sub("[^a-zA-Z0-9@]", "", word)
|
| 43 |
scores = []
|
| 44 |
matched_words = []
|
| 45 |
# If word > 4 chars
|
|
|
|
| 75 |
tweet_split[i] = profanity
|
| 76 |
tweet = ' '.join(tweet_split)
|
| 77 |
|
| 78 |
+
return tweet, matches
|
| 79 |
|
| 80 |
|
| 81 |
+
def preprocess(tweet):
|
| 82 |
laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
|
| 83 |
symbols = ['@', '#']
|
| 84 |
|
| 85 |
# Lowercase
|
| 86 |
+
tweet = tweet.lower()
|
| 87 |
|
| 88 |
# Remove emojis
|
| 89 |
+
tweet = emoji.replace_emoji(tweet, replace='')
|
| 90 |
|
| 91 |
# Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
|
| 92 |
+
tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
|
| 93 |
|
| 94 |
# Split sentence into list of words
|
| 95 |
+
row_split = tweet.split()
|
| 96 |
|
| 97 |
for index, word in enumerate(row_split):
|
| 98 |
|
|
|
|
| 134 |
return response.json()
|
| 135 |
|
| 136 |
|
| 137 |
+
def predict(tweet):
    """Detect Tagalog profanity in *tweet* and censor any matched words.

    Pipeline: fuzzy-normalize obfuscated profanities (e.g. 'g@go' -> 'gago'),
    preprocess the text, then query the hosted inference model.

    Returns a 3-tuple:
        output  -- dict mapping label -> confidence score (or the raw API
                   error string on failure),
        tweet   -- the original text, with each matched profanity masked by
                   '*' when the top label is 'Abusive',
        matches -- JSON string of the fuzzy-matched profanities
                   (or {"error": "error"} on API failure).
    """
    fuzz_text, matches = fuzzyLookup(tweet)
    processed_text = preprocess(fuzz_text)
    output = query(processed_text)

    if 'error' in output:
        # Inference API unavailable (model loading / rate limit) — surface it.
        return output['error'], 'Error occurred. Try again later.', {"error": "error"}

    # API shape: [[{'label': ..., 'score': ...}, ...]] -> {label: score},
    # ordered by descending confidence (first key is the predicted label).
    output = dict(tuple(i.values()) for i in output[0])
    predicted_label = list(output.keys())[0]

    if predicted_label == 'Abusive':
        # Mask every alphanumeric/@ character of each detected profanity,
        # leaving punctuation and spacing intact.
        for base_word in matches:
            tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))

    # Single exit point: both branches previously returned the identical tuple.
    return output, tweet, json.dumps(matches)
|
| 157 |
+
|
|
|
|
| 158 |
|
| 159 |
|
| 160 |
hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
|
|
|
|
| 167 |
|
| 168 |
outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
|
| 169 |
gr.components.Text(label='OUTPUT'),
|
| 170 |
+
gr.components.JSON(label='DETECTED PROFANITIES')],
|
| 171 |
|
| 172 |
examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
|
| 173 |
'Napakainit ngayong araw pakshet namaaan!!',
|