ana-bernal committed on
Commit
2c67c5b
·
1 Parent(s): 332a5b2

Modified app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -0
app.py CHANGED
@@ -1,5 +1,262 @@
1
  import gradio as gr
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
def greet(name):
    """Return the greeting string "Hello <name>!!" for the given name."""
    parts = ("Hello ", name, "!!")
    return "".join(parts)
5
 
 
1
  import gradio as gr
2
 
3
+ # For loading files
4
+ from joblib import dump, load
5
+
6
+ # Model hub
7
+ import tensorflow_hub as hub
8
+
9
+ # Language/text
10
+ import spacy
11
+ from bs4 import BeautifulSoup
12
+ from spacy.symbols import ORTH
13
+
14
+ # for listing tags from binary sequence
15
+ from itertools import compress
16
+
17
#---------------------------------------

# Loading pre-trained artifacts from disk.
path = './trained_models/'
filename_model = 'multinomialNB-use.joblib'
filename_scaler = 'scaler.joblib'

# Pre-trained multinomial Naive Bayes tag classifier.
clf = load(path + filename_model)

# Pre-trained feature scaler applied to embeddings before prediction.
scaler = load(path + filename_scaler)

#------------------------------------------
31
+
32
# Parameters

# Probability threshold above which a tag is considered predicted.
thresh = 0.4

# Candidate tags, in the exact order the classifier outputs its
# probability columns — do not reorder.
tag_list = [
    'c#', 'java', 'javascript', 'python', 'c++', 'ios', 'android',
    '.net', 'html', 'php', 'objective-c', 'jquery', 'c', 'iphone',
    'sql', 'asp.net', 'css', 'linux', 'node.js', 'performance',
    'spring', 'windows', 'swift', 'xcode', 'ruby-on-rails', 'mysql',
    'json', 'sql-server', 'multithreading', 'asp.net-mvc', 'ruby',
    'database', 'wpf', 'unit-testing', 'macos', 'arrays', 'c++11',
    'django',
]
73
+
74
+ # --------------------------------------------
75
+
76
+ # Functions
77
+
78
def remove_code(text):
    """Strip the contents of <code> and <script> elements from HTML text.

    Each matched element is replaced with a single space so that the
    words surrounding it do not run together in the cleaned output.

    Parameters
    ----------
    text : str
        Raw HTML text.

    Returns
    -------
    str
        The HTML with every <code>/<script> element replaced by ' '.
    """
    soup = BeautifulSoup(text, 'lxml')
    # find_all is the current bs4 API (findAll is the deprecated alias);
    # one loop over both tag names replaces the two duplicated loops.
    for tag_name in ('code', 'script'):
        for element in soup.find_all(tag_name):
            element.replace_with(' ')

    return str(soup)
96
+
97
def instantiate_spacy():
    """Load the small English spaCy pipeline into the module-global `nlp`."""
    global nlp
    # Instantiating the English language model used for tokenization.
    nlp = spacy.load("en_core_web_sm")
101
+
102
def import_stopwords():
    """Extend spaCy's default stop words with the project's custom list.

    Reads one stop word per line from ./stopwords/stopwords.txt and
    unions them into ``nlp.Defaults.stop_words``.  Requires
    ``instantiate_spacy()`` to have been called first so that the
    module-global ``nlp`` exists.
    """
    # Explicit encoding avoids platform-dependent default decodings.
    with open('./stopwords/stopwords.txt', encoding='utf-8') as file:
        my_stopwords = {line.rstrip() for line in file}

    # Adding my_stopwords to spacy stopwords
    nlp.Defaults.stop_words = nlp.Defaults.stop_words.union(my_stopwords)
109
+
110
def clean(text, tokenize=False, strict=False, **kwargs):
    """Clean and lemmatize a raw question text.

    Removes <code>/<script> blocks, HTML tags, newlines, non-ASCII
    characters and digits, normalizes spellings such as 'C #' -> 'c#',
    then tokenizes with spaCy, dropping stop words, punctuation and
    whitespace tokens.

    Parameters
    ----------
    text : str
        Raw (possibly HTML) question text.
    tokenize : bool
        If True return the list of lemmatized tokens; if False return
        the tokens joined back into a single string.
    strict : bool
        If True keep only nouns, proper nouns and verbs.
    **kwargs
        If a key 'ent' is present, the result is wrapped in a dict
        {'output': result, 'ents': doc.ents} exposing named entities.
    """
    # Removing <code>some code</code> and <script>...</script>.
    clean_txt = remove_code(text)

    # Removing remaining HTML tags.
    soup = BeautifulSoup(clean_txt, features='html.parser')
    clean_txt = soup.get_text()

    # Removing newline characters.
    clean_txt = clean_txt.replace('\n', ' ')

    # Removing non-ASCII (unicode) characters.
    clean_txt = clean_txt.encode("ascii", "ignore").decode()

    # Removing digits.
    clean_txt = ''.join(char for char in clean_txt if not char.isdigit())

    # Normalizing language-name spellings; the original code repeated
    # the 'C ++' replacement twice — once is sufficient.
    for old, new in (('c ++', 'c++'), ('c #', 'c#'),
                     ('C ++', 'c++'), ('C #', 'c#'), ('C#', 'c#')):
        clean_txt = clean_txt.replace(old, new)

    # Special-case tokenizer rules so multi-part tags survive
    # tokenization as single tokens (DRY'd from eight copies).
    for term in ("c#", ".net", "objective-c", "asp.net", "node.js",
                 "ruby-on-rails", "sql-server", "unit-testing"):
        nlp.tokenizer.add_special_case(term, [{ORTH: term}])

    # Tokenize with spaCy.
    doc = nlp(clean_txt)

    # Keep lemmas, skipping stop words / punctuation / whitespace;
    # in strict mode additionally keep only nouns, proper nouns, verbs.
    def _keep(token):
        if token.is_stop or token.is_punct or token.is_space:
            return False
        return (not strict) or token.pos_ in ('NOUN', 'PROPN', 'VERB')

    tokens = [token.lemma_.lower() for token in doc if _keep(token)]

    clean_txt = ' '.join(tokens)

    # Return either the token list or the joined text.
    result = tokens if tokenize else clean_txt

    # Option for list of entities in output.
    if 'ent' in kwargs:
        result = {'output': result, 'ents': doc.ents}

    return result
202
+
203
def my_pred(X):
    """Scale a USE embedding and predict a binary tag vector.

    The embedding X is passed through the pre-trained scaler, the
    classifier's probabilities are computed, and each probability is
    thresholded at the module-level `thresh`, yielding a flat 0/1
    vector of length len(tag_list).
    """
    # Scaling with the pre-trained scaler.
    scaled = scaler.transform(X)

    # Thresholding the predicted probabilities with the tuned cutoff.
    probabilities = clf.predict_proba(scaled)
    return (probabilities > thresh).astype(int).reshape((len(tag_list),))
218
+
219
+
220
def binary_to_tag_list(binary):
    """Convert a 0/1 sequence into the corresponding list of tag names.

    Positions of `binary` align with the module-level `tag_list`; a
    truthy entry selects the tag at the same index.
    """
    return [tag for tag, keep in zip(tag_list, binary) if keep]
228
+
229
def tag_suggestion(raw_text):
    """Suggest tags for the question text `raw_text`.

    Pipeline: clean the text, embed it with the Universal Sentence
    Encoder, classify the embedding, and map the binary prediction
    back to tag names.
    """
    # Clean the text, then embed the single-document batch with USE.
    cleaned = clean(raw_text)
    embedding = embed([cleaned])

    # Predict a tag set with the classification model.
    prediction = my_pred(embedding)
    return binary_to_tag_list(prediction)
244
+
245
# --------------------------------------------------

# Execution

# Build the spaCy pipeline first, then extend its stop-word list.
instantiate_spacy()
import_stopwords()

# Import and instantiate the Universal Sentence Encoder embedding model.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# --------------------------------------------------
257
+
258
+
259
+
260
def greet(name):
    """Return the greeting string "Hello <name>!!" for the given name."""
    parts = ("Hello ", name, "!!")
    return "".join(parts)
262