aamirtaymoor committed on
Commit
7912ca1
·
verified ·
1 Parent(s): aad7102

Upload ml_service.py

Browse files
Files changed (1) hide show
  1. ml_service.py +396 -0
ml_service.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import itertools
import json
import logging
import re
import unicodedata

import fasttext
import pandas as pd
import spacy
from django.conf import settings
from emoji import demojize
from simpletransformers.ner import NERModel
from spacy.matcher import PhraseMatcher

from einstein.constants import POSITIVE_SENTIMENT_PATTERNS, LABEL_COLOR, CATEGORY_THRESHOLD
13
+
14
+
15
# Project root taken from the Django settings; the model artefact paths below
# are all built relative to it.
base_directory = settings.BASE_DIR

# JSON file that MlProcessing.load_ner_model reads to obtain the NER label list.
labels_file = f"{base_directory}/ml_models/labels.json"
# Directory passed to NERModel as the location of the trained NER model.
ner_model_directory = f"{base_directory}/ml_models/ner_model/"
# fastText model file loaded by MlProcessing.load_sentiment_model.
sentiment_model_file = f"{base_directory}/ml_models/sentiment_model/model.ft"
20
+
21
+
22
class MlProcessing:
    """Run the review pipeline (score, clean, NER, sentiment, reformat) on one comment.

    The comment is a plain dict that is mutated in place as it moves through
    the pipeline. Keys read by this class: ``text``, ``star_rating``,
    ``tali_score`` and ``skip``; keys written: ``score``, ``score_str``,
    ``cleaned``, ``spans``, ``skip`` and ``color_map``.
    """

    def __init__(self, comment_dict):
        """Store the comment dict that every pipeline step reads and mutates."""
        self.comment_dict = comment_dict
        self.is_cleaned = False

    def remove_prefix(self, label):
        """Return the part of ``label`` after the last ``-`` (``'B-FOOD'`` -> ``'FOOD'``)."""
        return label.split('-')[-1]

    def labels_to_spans(self, tokens, labels):
        """Merge consecutive tokens that share a (prefix-less) label into spans.

        :param tokens: token dicts carrying ``start`` and ``end`` offsets
        :param labels: one label per token, in the same order
        :return: list of ``{'label', 'start', 'end', 'n_tokens'}`` dicts; runs
            labelled ``'O'`` are left out
        """
        spans = []
        for label, group in itertools.groupby(zip(tokens, labels), key=lambda x: self.remove_prefix(x[1])):
            if label == 'O':
                continue

            group_tokens = [t for t, _ in group]
            spans.append({'label': label, 'start': group_tokens[0]['start'], 'end': group_tokens[-1]['end'],
                          'n_tokens': len(group_tokens)})

        return spans

    def score_to_str(self, score):
        """Return ``'RATING_<int(score)>'``, or ``''`` when the score is missing."""
        if pd.isna(score):
            return ''
        return f'RATING_{int(score)}'

    def configure_matcher(self, nlp, patterns):
        """Build a case-insensitive ``PhraseMatcher`` registered under the key ``'positive'``."""
        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        patterns = [nlp.make_doc(p) for p in patterns]
        matcher.add('positive', patterns)
        return matcher

    def cleaner(self):
        """Clean ``comment_dict['text']`` with ``ReviewsCleaner`` and flag the comment as cleaned."""
        cleaner = ReviewsCleaner()
        self.comment_dict['text'] = cleaner.clean_text(self.comment_dict['text'])
        self.comment_dict['cleaned'] = True
        self.is_cleaned = True

    def clip(self, x, min_, max_):
        """Clamp ``x`` into the closed interval ``[min_, max_]``."""
        if x < min_:
            return min_
        if x > max_:
            return max_
        return x

    @staticmethod
    def _to_number(value):
        """Return ``value`` as a finite float, or ``None`` when it is not a usable number."""
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN compares unequal to itself; infinities carry no rating information.
        if number != number or number in (float('inf'), float('-inf')):
            return None
        return number

    def get_score(self):
        """Derive a 0-5 score from ``star_rating`` or, failing that, ``tali_score``.

        ``star_rating`` is used as is; ``tali_score`` is floor-divided by 2.
        Both are clipped to ``[0, 5]``. Values that cannot be parsed as a
        finite number are ignored, so fractional ratings such as ``4.5`` or
        ``'4.5'`` are accepted instead of being discarded.

        :return: dict with ``score`` (number or ``None``) and ``score_str``
        """
        record = dict()
        star_rating = self._to_number(self.comment_dict.get('star_rating'))
        tali_score = self._to_number(self.comment_dict.get('tali_score'))

        if star_rating is not None:
            record['score'] = self.clip(star_rating, 0, 5)
        elif tali_score is not None:
            record['score'] = self.clip(tali_score // 2, 0, 5)
        else:
            record['score'] = None

        record['score_str'] = self.score_to_str(record['score'])

        return record

    @staticmethod
    def _unlabelled_segment(segment_text):
        """Return an output segment for text that carries no label."""
        return {"label": segment_text, "color": "", "value": "", "sentiment": "", "score": None}

    def reformat_output(self, data):
        """Replace ``data['spans']`` with a gap-free list of display segments.

        Every labelled span becomes a segment carrying its text, colour,
        label, sentiment and score; text before, between and after the spans
        becomes unlabelled segments. Span ``end`` offsets are exclusive (the
        span text is ``text[start:end]``), so a gap starts exactly at the
        previous ``end`` and no character between two spans is lost.

        When there are no spans the result is an empty list. ``data`` is
        updated in place and nothing is returned.
        """
        text = data["text"]
        spans = data.get("spans", list())
        new_spans = list()
        cursor = 0  # exclusive end offset of the text emitted so far

        for span in spans:
            span_start = span["start"]
            span_end = span["end"]

            # Unlabelled text sitting between the previous segment and this span.
            if span_start > cursor:
                new_spans.append(self._unlabelled_segment(text[cursor:span_start]))

            new_spans.append({
                "label": text[span_start:span_end],
                "color": LABEL_COLOR[span["label"]],
                "value": span["label"],
                "sentiment": span["sentiment"],
                "score": span["score"]
            })

            cursor = span_end

        # Unlabelled text remaining after the last labelled span.
        if spans and cursor < len(text):
            new_spans.append(self._unlabelled_segment(text[cursor:]))

        data.update({"spans": new_spans})

    def preprocess_text(self, text):
        """Normalise text before it is passed to the sentiment model.

        Lower-cases, turns a dot that follows another dot into a space, trims
        surrounding whitespace and ``. ",`` characters, replaces newlines and
        the typographic apostrophe, and collapses whitespace runs.
        """
        text = text.lower()
        text = re.sub(r'(?<=\.)\.', ' ', text)
        text = text.strip().strip('. ",')
        text = text.replace('\n', ' ')
        text = text.replace('’', "'")
        text = re.sub(r'\s+', ' ', text)
        return text

    def predict(self, model, text, category):
        """Classify ``text`` as POSITIVE or NEGATIVE for the given category.

        The probability of ``__label__POSITIVE`` is looked up by label among
        the top-2 predictions instead of by position, so a model that returns
        fewer than two labels yields a probability of 0.0 instead of an
        ``IndexError``.

        :return: ``{'label': 'POSITIVE' | 'NEGATIVE', 'score': <positive probability>}``
        """
        text = self.preprocess_text(text)
        labels, probs = model.predict(text, k=2)

        prob = 0.0
        for predicted_label, predicted_prob in zip(labels, probs):
            if predicted_label == '__label__POSITIVE':
                prob = float(predicted_prob)
                break

        if prob >= CATEGORY_THRESHOLD[category]:
            label = 'POSITIVE'
        else:
            label = 'NEGATIVE'

        return {'label': label, 'score': prob}

    def apply_sentiment_model(self, review_dict_entities):
        """Attach ``sentiment`` and ``score`` to every span of the comment.

        A span that matches one of ``POSITIVE_SENTIMENT_PATTERNS`` is marked
        POSITIVE with score 1.0; any other span is scored by the fastText
        model on the text running from the start of its sentence to the end
        of the span, prefixed with the rating string and the span label.

        :param review_dict_entities: unused; kept for interface compatibility
        :return: ``self.comment_dict``
        """
        # Nothing to score: return before paying for any model load.
        if self.comment_dict.get('skip'):
            return self.comment_dict
        spans = self.comment_dict.get('spans', [])
        if not spans:
            return self.comment_dict

        nlp = spacy.load('en_core_web_sm')
        sentence_finder = SentenceBoundsFinder(nlp)
        positive_sentiment_matcher = self.configure_matcher(nlp, POSITIVE_SENTIMENT_PATTERNS)
        sentiment_model = self.load_sentiment_model()

        # NOTE(review): replacing the 7-character '_x000D_' by a single space
        # shifts offsets relative to comment_dict['text']; presumably harmless
        # because cleaner() strips that token beforehand -- confirm.
        review = re.sub(r'["“”]|_x000D_', ' ', self.comment_dict['text'])
        sentence_bounds = sentence_finder(review)

        for span in spans:
            segment_text = self.comment_dict['text'][span['start']:span['end']].replace('\n', ' ')
            segment_doc = nlp(segment_text)
            matches = positive_sentiment_matcher(segment_doc)

            if matches:
                sentiments = {'label': 'POSITIVE', 'score': 1.}
            else:
                span_start = self.get_sentence_start(sentence_bounds, span['start'])
                text = self.comment_dict['text'][span_start:span['end']].replace('\n', ' ')
                text = f"{self.comment_dict['score_str'].lower()} {span['label'].lower()} {text}"
                sentiments = self.predict(sentiment_model, text, span['label'])

            span['sentiment'] = sentiments.get('label')
            span['score'] = sentiments.get('score')
            print(f"Sentiments : {sentiments}")
        return self.comment_dict

    def load_sentiment_model(self):
        """Load the fastText sentiment model from ``sentiment_model_file``."""
        return fasttext.load_model(sentiment_model_file)

    def get_sentence_start(self, sentence_bounds, position):
        """Return the start offset of the first sentence whose bounds contain ``position``.

        :raises RuntimeError: when no ``(start, end)`` pair contains ``position``
        """
        for start, end in sentence_bounds:
            if start <= position <= end:
                return start

        raise RuntimeError('Failed to get sentence bound')

    def load_ner_model(self, max_seq_len=500, use_multiprocessing=False):
        """Instantiate the longformer ``NERModel`` from ``ner_model_directory``.

        :param max_seq_len: value passed as ``max_seq_length``
        :param use_multiprocessing: passed to both multiprocessing options
        """
        args = {'overwrite_output_dir': False, 'reprocess_input_data': True, 'num_train_epochs': 30,
                'evaluation_strategy': 'epoch', 'evaluate_during_training': True, 'silent': True,
                'max_seq_length': max_seq_len, 'use_multiprocessing': use_multiprocessing,
                'use_multiprocessing_for_evaluation': use_multiprocessing, 'fp16': True}

        with open(labels_file) as f:
            labels = json.load(f)

        return NERModel('longformer', ner_model_directory, args=args, use_cuda=False, labels=labels)

    def apply_ner_model(self):
        """Tag the comment text with the NER model and store the result in ``spans``.

        Everything from the first ``(original...)`` marker onwards is removed
        from the text first. The model is loaded once, with a maximum sequence
        length of 1.5 times the token count of the document. If prediction
        fails the error is logged and the comment is flagged with
        ``skip = True``.

        :return: ``self.comment_dict`` in every case
        """
        if self.comment_dict.get('skip'):
            return self.comment_dict

        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe('sentencizer')

        regex = re.compile(r'(\(original.{0,3}\).+)', re.IGNORECASE | re.MULTILINE | re.DOTALL)
        self.comment_dict['text'] = regex.sub('', self.comment_dict['text'])
        doc = nlp(self.comment_dict['text'])

        # Loading stays outside the try block so that load-time errors keep
        # propagating to the caller exactly as before.
        ner_model = self.load_ner_model(int(1.5 * len(doc)))

        self.comment_dict['_doc'] = doc
        try:
            self._apply_ner_model(ner_model, self.comment_dict)
        except Exception:
            logging.getLogger(__name__).exception('Failed to apply NER model; skipping comment')
            self.comment_dict['skip'] = True
        finally:
            # Never leave the spaCy Doc behind in the outgoing dict.
            self.comment_dict.pop('_doc', None)
        return self.comment_dict

    def _apply_ner_model(self, ner_model, item):
        """Predict one label per token of ``item['_doc']`` and store the spans on ``item``.

        ``_doc`` is removed from ``item``. When the number of predictions does
        not match the number of tokens, ``item['spans']`` is set to ``[]``.
        """
        doc = item.pop('_doc')

        predictions, _ = ner_model.predict([[t.text for t in doc]], split_on_space=False)
        predictions = predictions[0]

        tokens = doc.to_json()['tokens']
        if len(tokens) != len(predictions):
            item['spans'] = []
            return

        # Each prediction is a mapping whose first value is the predicted label.
        for t, p in zip(tokens, predictions):
            t['label'] = next(iter(p.values()))

        labels = [t['label'] for t in tokens]

        spans = self.labels_to_spans(tokens, labels)
        item['spans'] = self.postprocess_spans(spans)

    def postprocess_spans(self, spans):
        """Relabel spans shorter than three tokens, then merge equal-label neighbours.

        A short span takes the label of its only neighbour (first or last
        position) or of both neighbours when they agree; otherwise, or when it
        is the only span, it becomes ``'O'``. Relabelling happens in order and
        in place, so later spans see earlier changes. ``'O'`` spans are
        dropped from the result.

        :return: list of ``{'start', 'end', 'label'}`` dicts
        """
        if spans:
            for j, span in enumerate(list(spans)):
                if span['n_tokens'] < 3:
                    if len(spans) > 1:
                        if j == 0:
                            spans[j]['label'] = spans[j + 1]['label']
                        elif j == len(spans) - 1:
                            spans[j]['label'] = spans[j - 1]['label']
                        elif spans[j - 1]['label'] == spans[j + 1]['label']:
                            spans[j]['label'] = spans[j - 1]['label']
                        else:
                            spans[j]['label'] = 'O'
                    else:
                        spans[j]['label'] = 'O'

        new_spans = []
        for label, label_spans in itertools.groupby(spans, key=lambda s: s['label']):
            if label == 'O':
                continue

            label_spans = list(label_spans)

            new_spans.append({'start': label_spans[0]['start'], 'end': label_spans[-1]['end'], 'label': label})

        return new_spans

    def process_comment(self):
        """Run the whole pipeline and return the enriched comment dict.

        The returned dict always carries ``color_map``, a list of
        ``{'label', 'color'}`` entries built from ``LABEL_COLOR``.
        """
        sentiment = dict()
        score_dict = self.get_score()
        self.comment_dict.update(score_dict)
        self.cleaner()
        try:
            review_dict_entities = self.apply_ner_model()
            sentiment = self.apply_sentiment_model(review_dict_entities)
            self.reformat_output(sentiment)
        # for very small texts ner model errors
        except AssertionError:
            self.comment_dict["skip"] = True
            sentiment.update(self.comment_dict)

        label_color_mappings = [
            {"label": label, "color": label_color} for label, label_color in LABEL_COLOR.items()
        ]
        sentiment.update({"color_map": label_color_mappings})
        return sentiment

    def main(self):
        """Entry point; same as ``process_comment``."""
        return self.process_comment()
298
+
299
+
300
class SentenceBoundsFinder:
    """Callable that returns the character bounds of every sentence in a text."""

    def __init__(self, nlp=None):
        """Wrap a spaCy pipeline and make sure it contains a ``sentencizer``.

        :param nlp: pipeline to use; ``en_core_web_sm`` is loaded when omitted.
            The pipeline is modified in place if the component has to be added.
        """
        self._nlp = nlp if nlp is not None else spacy.load('en_core_web_sm')
        # Adding a component name that is already registered raises in spaCy,
        # so only add it when the pipeline does not have it yet.
        if 'sentencizer' not in self._nlp.pipe_names:
            self._nlp.add_pipe('sentencizer')

    def __call__(self, text):
        """Return a list of ``(start_char, end_char)`` tuples, one per sentence."""
        return [(sent.start_char, sent.end_char) for sent in self._nlp(text).sents]
312
+
313
+
314
class ReviewsCleaner:
    """
    Clean review texts with a configurable, ordered chain of text transformations.

    :param replace_emojis: Replace emojis with the text representing them
    :param unicode_normalize: NFD-normalize the text and drop combining marks (category ``Mn``)
    :param remove_non_regular_chars: Remove characters whose ordinal number is 128 or above
    :param remove_junk: Replace ``*``, newline, carriage return, tab and ``_x000D_`` with a space
    :param remove_double_spaces: Collapse whitespace runs and trim the text
    :param remove_boundary_quotes: Remove a single quote at the start and at the end of the text
    :param same_quotes: Transform the handled quote marks into a single quote mark

    NOTE(review): ``stats`` and ``skipped`` are initialised and printed but are
    not updated anywhere in this class -- confirm whether the bookkeeping was
    dropped on purpose.
    """

    def __init__(self, replace_emojis=True, unicode_normalize=True, remove_non_regular_chars=True, remove_junk=True,
                 remove_double_spaces=True, remove_boundary_quotes=True, same_quotes=True):
        self.methods = []
        # Add new methods here !!! MIND THE ORDER !!!
        if replace_emojis:
            self.methods.append(('Deemojize', lambda text: self.__demojize(text)))
        if unicode_normalize:
            self.methods.append(('Normalize', lambda text: ''.join(
                c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')))
        if same_quotes:
            self.methods.append(('Same quotes', lambda text: re.sub(r'"|’|`|“', "'", text)))
        if remove_boundary_quotes:
            self.methods.append(('Rm boundary quotes', lambda text: self.__remove_boundary(text)))
        if remove_junk:
            self.methods.append(('Remove junk', lambda text: re.sub(r'\*|\n|\r|\t|_x000D_', ' ', text)))
        if remove_non_regular_chars:
            self.methods.append(('Remove non-regular', lambda text: ''.join(c for c in text if ord(c) < 128)))
        if remove_double_spaces:
            self.methods.append(('Remove double spaces', lambda text: ' '.join(text.split())))
        # method name -> [characters changed, reviews affected]
        self.stats = {name: [0, 0] for name, _ in self.methods}
        self.analyzed_reviews = 0
        self.skipped = 0

    def clean_stats(self):
        """Reset statistics to the state they have right after construction."""
        self.stats = {name: [0, 0] for name, _ in self.methods}
        self.analyzed_reviews = 0
        self.skipped = 0

    def print_stats(self):
        """Print statistics of used methods"""
        # All counters are zero before the first review, so dividing by 1
        # prints 0.00% instead of raising ZeroDivisionError.
        total = self.analyzed_reviews or 1
        print(f'Reviews analyzed: {self.analyzed_reviews}')
        print("{:<20} {:<10} {:<10}".format('Name', 'Avg. % of chars', '% of reviews affected'))
        for name, item in self.stats.items():
            print("{:<20} {:<10} {:<10}".format(name, f'{(100 * item[0] / total):.2f}%',
                                                f'{(100 * item[1] / total):.2f}%'))
        print(f'Language skip\t-\t{(100 * self.skipped / total):.2f}%')

    def clean_text(self, text):
        """Run every enabled cleaning step, in order, over one text.

        Empty input (``''`` or ``None``) is returned unchanged.
        """
        self.analyzed_reviews += 1
        if not text:
            return text

        for _, method_fun in self.methods:
            text = method_fun(text)
        return text

    @staticmethod
    def __demojize(text):
        """Replace emojis with their space-delimited names and strip skin-tone suffixes."""
        text = demojize(text, delimiters=[' ', ' '])
        text = re.sub('_[a-z]*_skin_tone', '', text)
        return text

    @staticmethod
    def __remove_boundary(text):
        """Remove one leading and one trailing single quote, if present."""
        if text[:1] == '\'':
            text = text[1:]
        if text[-1:] == '\'':
            text = text[:-1]
        return text
386
+
387
+
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+