mahidher committed on
Commit
5a64e97
·
1 Parent(s): a640291

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -0
app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import matplotlib.pyplot as plt
5
+ from tensorflow import keras
6
+ from keras.preprocessing.text import Tokenizer
7
+ from keras.utils import pad_sequences
8
+ import nltk
9
+ from nltk.corpus import stopwords
10
+ import pickle
11
+ from nltk.tokenize import word_tokenize
12
+ import re
13
+ from sklearn.model_selection import train_test_split
14
+ from nltk.tokenize import word_tokenize
15
+
16
+ import gradio as gr
17
+
18
# One-time download of the NLTK resources the preprocessing pipeline needs:
# stopword list, Punkt sentence/word tokenizer models, and WordNet (plus the
# Open Multilingual WordNet data) for lemmatization. No-op if already cached.
nltk.download('stopwords')

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# NOTE(review): leftover debug output — harmless, but consider removing.
print("hello")
25
+
26
# Load the Keras Tokenizer that was fitted on the training corpus and pickled.
# NOTE(review): pickle.load is unsafe on untrusted files — this assumes
# comment_tokenizer.pkl ships with the app and is trusted.
with open('comment_tokenizer.pkl', 'rb') as file:

    # Deserialize the fitted tokenizer
    tokenizer = pickle.load(file)


# Fixed sequence length used at training time; inference inputs are padded
# to this length below. TODO confirm against the training pipeline.
max_len = 1348

# Pre-trained toxicity classifier; score_comment() reads 6 label columns
# from its output.
model = keras.models.load_model('comment_toxicity_model.h5')
35
+
36
# English contraction -> expansion lookup used by expand_contractions().
# Keys are lowercase; input text is expected to be lowercased before matching.
# Fix: "he'll've" previously expanded to "he he will have" (duplicated "he").
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}
155
+
156
def expand_contractions(sentences, contraction_map=None):
    """Expand English contractions in each sentence.

    Mutates *sentences* (a list of strings, expected lowercase) in place;
    returns None.

    Parameters
    ----------
    sentences : list[str]
        Sentences to rewrite in place.
    contraction_map : dict[str, str], optional
        Contraction -> expansion lookup; defaults to the module-level
        CONTRACTION_MAP (backward compatible with the old signature).

    Fixes
    -----
    The alternation is now built longest-key-first, so "can't've" matches
    before "can't" (the old dict-order pattern produced "cannot've"), and
    keys are re.escape()d so regex metacharacters are taken literally.
    """
    mapping = CONTRACTION_MAP if contraction_map is None else contraction_map
    # Longest alternatives first: Python's re picks the leftmost alternative
    # that matches, not the longest one.
    keys = sorted(mapping.keys(), key=len, reverse=True)
    contractions_re = re.compile('(%s)' % '|'.join(map(re.escape, keys)))

    def _replace(match):
        return mapping[match.group(0)]

    for i in range(len(sentences)):
        sentences[i] = contractions_re.sub(_replace, sentences[i])
164
+
165
+
166
def remove_newlines_and_tabs(sentences):
    """Replace newlines, tabs and backslashes with single spaces.

    Mutates *sentences* (a list of strings) in place; returns None.
    """
    # Single C-level pass per sentence instead of three chained .replace() calls.
    table = str.maketrans({'\n': ' ', '\t': ' ', '\\': ' '})
    for idx, sentence in enumerate(sentences):
        sentences[idx] = sentence.translate(table)
170
+
171
# English stopwords as a set for O(1) membership checks in remove_stopwords().
stoplist = set(stopwords.words('english'))
172
+
173
def remove_stopwords(sentences):
    """Remove English stopwords from each sentence (case-insensitive).

    Mutates *sentences* (a list of strings) in place; returns None.
    """
    for idx, sentence in enumerate(sentences):
        kept = [word for word in word_tokenize(sentence)
                if word.lower() not in stoplist]
        sentences[idx] = " ".join(kept)
179
+
180
+
181
# Whitespace tokenizer and WordNet lemmatizer shared by lemmetization() below;
# built once at import time so they are not re-created per call.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
183
+
184
+
185
def lemmetization(sentences):
    """Lemmatize every whitespace-delimited token as a verb ('v' POS tag).

    Mutates *sentences* (a list of strings) in place; returns None.
    (Function name spelling kept as-is — score_comment() calls it by this name.)
    """
    for idx, sentence in enumerate(sentences):
        sentences[idx] = " ".join(
            lemmatizer.lemmatize(token, 'v')
            for token in w_tokenizer.tokenize(sentence)
        )
190
+
191
+
192
def score_comment(comment):
    """Preprocess one raw comment and score it with the toxicity model.

    Parameters
    ----------
    comment : str
        Raw user comment from the Gradio textbox.

    Returns
    -------
    str
        One "<label>: <True|False>" line per toxicity class, True when the
        model's probability for that class exceeds 0.5.
    """
    # Every preprocessing helper mutates this one-element list in place.
    sentences = [comment]
    expand_contractions(sentences)
    remove_newlines_and_tabs(sentences)
    remove_stopwords(sentences)
    lemmetization(sentences)
    # Map words to the training-time vocabulary ids, then pad to the fixed
    # length the model expects.
    tokenized = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(tokenized,maxlen=max_len,padding = 'post')
    results = model.predict(padded)

    text = ''
    # Label order is assumed to match the model's output columns —
    # TODO confirm against the training labels.
    for idx, col in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
    'identity_hate']):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    print(text)
    return text
208
+
209
# Example manual check (left disabled): score_comment("<some toxic comment>")

# Gradio UI: a 2-line textbox in, the per-label verdict string out.
# NOTE(review): gr.inputs.Textbox is the legacy pre-3.x Gradio API; on
# Gradio >= 3 this would be gr.Textbox — confirm the pinned gradio version.
interface = gr.Interface(fn=score_comment,
inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
outputs='text')

# share=True requests a public tunnel link (ignored on Hugging Face Spaces).
interface.launch(share=True)