Spaces:
Running
Running
| import spacy | |
| import re | |
| import keras.models as models | |
| from nltk.corpus import stopwords | |
| import nltk | |
| import modules.utilities.utils as utils | |
# Filesystem layout for the sentiment-analysis assets.
BASE_PATH = './data/'
# Whitespace-separated vocabulary file consumed by load_vocab().
VOCAB = BASE_PATH + 'vocab.txt'
# Directory holding saved Keras models (*.h5).
MODEL = BASE_PATH + 'model/'
# Directory holding optional model weight files.
WEIGHTS = BASE_PATH + 'weights/'
# Directory holding serialized Keras tokenizers (*.json).
TOKEN = BASE_PATH + 'tokenizer/'
def init():
    """Download the NLTK stopword corpus and load the Italian spaCy model.

    Side effect: binds the module-level global ``nlp`` used by
    lemma_text().  Must be called before any prediction.
    """
    nltk.download('stopwords')
    global nlp
    nlp = spacy.load("it_core_news_lg")
def load_vocab():
    """Read the vocabulary file and return its words as a set."""
    raw_text = utils.load_doc(VOCAB)
    return set(raw_text.split())
def lemma_text(text):
    """Return *text* with every token replaced by its lemma.

    Runs the global spaCy pipeline (set up by init()) and rejoins the
    lemmas with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))
def splitWords(l):
    """Yield every whitespace-separated word from each string in *l*."""
    for chunk in l:
        for word in chunk.split():
            yield word
# Any character that is neither a word character nor whitespace, i.e.
# punctuation.  Compiled once at import time instead of on every token.
_PUNCT_RE = re.compile(r'[^\w\s]')

def clean_doc(doc):
    """Tokenize *doc* into cleaned word tokens.

    Pipeline: replace punctuation with spaces, split on whitespace,
    keep only purely alphabetic tokens, drop Italian stop words, and
    drop tokens shorter than 3 characters.

    :param doc: raw text string
    :return: list of cleaned tokens
    """
    # One substitution over the whole document followed by a single
    # split is equivalent to the previous per-token sub + re-split,
    # without building the intermediate list.
    tokens = _PUNCT_RE.sub(' ', doc).split()
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('italian'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short tokens
    return [word for word in tokens if len(word) > 2]
def predict_sentiment(review, vocab, tokenizer, model):
    """Classify *review* and return ``(confidence, label)``.

    ``label`` is 'POSITIVE' or 'NEGATIVE'; ``confidence`` is the
    model's probability for the returned label.
    """
    # Clean the review and keep only in-vocabulary words.
    words = [w for w in clean_doc(review) if w in vocab]
    # Binary bag-of-words encoding of the single document.
    encoded = tokenizer.texts_to_matrix([' '.join(words)], mode='binary')
    # Probability of the positive class.
    percent_pos = model.predict(encoded, verbose=0)[0, 0]
    if round(percent_pos) == 0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'
def predict(model_path, weights_path, tokenizer_path, text, debug):
    """Load model artifacts and run sentiment prediction on *text*.

    Returns ``(lemmatized_doc, sentiment_label, confidence)``.
    """
    tokenizer = utils.load_tokenizer(tokenizer_path)
    vocab = load_vocab()
    doc = lemma_text(text)
    model = models.load_model(model_path, compile=False)
    if weights_path != '':
        # Optional fine-tuned weights; layers that do not match are skipped.
        model.load_weights(weights_path, skip_mismatch=True, by_name=True)
    percent, sentiment = predict_sentiment(doc, vocab, tokenizer, model)
    if debug:
        print('Review: [%s]\nSentiment: %s (%.3f%%)' % (doc, sentiment, percent * 100))
    return doc, sentiment, percent
def binary_classification(text):
    """Run binary sentiment classification on *text*.

    :param text: input sentence (Italian)
    :return: dict with 'positive' and 'negative' probabilities as
        2-decimal strings, or ``({"error": ...}, 415)`` when *text*
        is empty.
    """
    # Validate input up front with a plain check instead of the old
    # raise-Exception/bare-except dance; also rejects None.  Doing the
    # check before init() avoids a pointless NLTK download and spaCy
    # model load for invalid requests.
    if not text:
        return {"error": "Sentence is required"}, 415
    init()
    model = MODEL + 'binary-classification.h5'
    weights = ''
    tokenizer = TOKEN + 'binary-classification-tokenizer.json'
    doc, sentiment, percent = predict(model, weights, tokenizer, text, False)
    # percent is typically a numpy scalar; float() handles both numpy
    # scalars and plain floats (astype() would not).
    confidence = float(percent) * 100
    if sentiment == 'POSITIVE':
        positive, negative = confidence, 100 - confidence
    else:
        negative, positive = confidence, 100 - confidence
    return {
        'positive': "%.2f" % (positive / 100),
        'negative': "%.2f" % (negative / 100),
    }