"""Italian binary sentiment analysis.

Pipeline: lemmatize input with spaCy, clean/tokenize it, filter by the model
vocabulary, encode with a Keras tokenizer, and classify with a Keras model.
"""

import re

import keras.models as models
import nltk
import spacy
from nltk.corpus import stopwords

import modules.utilities.utils as utils

BASE_PATH = './data/'
VOCAB = BASE_PATH + 'vocab.txt'
MODEL = BASE_PATH + 'model/'
WEIGHTS = BASE_PATH + 'weights/'
TOKEN = BASE_PATH + 'tokenizer/'


def init():
    """Download NLTK stopwords and load the Italian spaCy pipeline.

    Side effect: binds the loaded pipeline to the module-level global ``nlp``,
    which ``lemma_text`` requires.
    """
    nltk.download('stopwords')
    global nlp
    nlp = spacy.load("it_core_news_lg")


def load_vocab():
    """Return the model vocabulary as a set of whitespace-separated tokens.

    Reads the file at ``VOCAB`` via the project utility loader.
    """
    vocab = utils.load_doc(VOCAB)
    return set(vocab.split())


def lemma_text(text):
    """Return ``text`` with each token replaced by its lemma.

    Requires ``init()`` to have been called so the global ``nlp`` exists.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc)


def splitWords(l):
    """Yield every whitespace-separated word from each string in ``l``."""
    for words in l:
        yield from words.split()


def clean_doc(doc):
    """Tokenize ``doc`` for classification.

    Steps: split on whitespace, replace punctuation with spaces (then
    re-split, so words glued by punctuation separate), keep only alphabetic
    tokens, drop Italian stopwords, and drop tokens of length <= 2.
    Returns a list of cleaned tokens.
    """
    tokens = doc.split()
    # Substitute punctuation with a space rather than deleting it, so
    # "word1.word2" yields two tokens after the re-split below.
    tokens = [re.sub(r'[^\w\s]', ' ', w) for w in tokens]
    tokens = list(splitWords(tokens))
    # Remove remaining tokens that are not alphabetic.
    tokens = [word for word in tokens if word.isalpha()]
    # Filter out Italian stopwords.
    stop_words = set(stopwords.words('italian'))
    tokens = [w for w in tokens if w not in stop_words]
    # Filter out short tokens (little discriminative value).
    return [word for word in tokens if len(word) > 2]


def predict_sentiment(review, vocab, tokenizer, model):
    """Classify a review and return ``(confidence, label)``.

    ``label`` is 'POSITIVE' or 'NEGATIVE'; ``confidence`` is the model's
    probability for that label (1 - p when the prediction rounds to 0).
    """
    tokens = clean_doc(review)
    # Keep only tokens the model was trained on.
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0, 0]
    if round(percent_pos) == 0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'


def predict(model_path, weights_path, tokenizer_path, text, debug):
    """Load model artifacts and classify ``text``.

    Returns ``(lemmatized_doc, sentiment_label, confidence)``. When
    ``weights_path`` is non-empty, separate weights are loaded on top of
    the model. ``debug=True`` prints the review and its prediction.
    """
    tokenizer = utils.load_tokenizer(tokenizer_path)
    vocab = load_vocab()
    doc = lemma_text(text)
    model = models.load_model(model_path, compile=False)
    if weights_path != '':
        model.load_weights(weights_path, skip_mismatch=True, by_name=True)
    percent, sentiment = predict_sentiment(doc, vocab, tokenizer, model)
    if debug:
        print('Review: [%s]\nSentiment: %s (%.3f%%)' % (doc, sentiment, percent * 100))
    return doc, sentiment, percent


def binary_classification(text):
    """Run binary sentiment classification on ``text``.

    Returns a dict with 'positive' and 'negative' probabilities formatted
    to two decimals, or ``({"error": ...}, 415)`` when ``text`` is empty.
    """
    init()
    model = MODEL + 'binary-classification.h5'
    weights = ''
    tokenizer = TOKEN + 'binary-classification-tokenizer.json'
    # Direct guard replaces the original raise-Exception/bare-except pair,
    # which also silently swallowed any unrelated error.
    if text == "":
        return {"error": "Sentence is required"}, 415
    doc, sentiment, percent = predict(model, weights, tokenizer, text, False)
    # float() accepts both numpy scalars and plain Python floats; the
    # original .astype(float) crashed on plain floats.
    score = float(percent) * 100
    if sentiment == 'POSITIVE':
        positive, negative = score, 100 - score
    else:
        negative, positive = score, 100 - score
    labels = {
        'positive': "%.2f" % (positive / 100),
        'negative': "%.2f" % (negative / 100),
    }
    return labels