# ngt-ai-platform / modules/binary_classification.py
# Updated binary-classification components (commit e33d658).
import re
from functools import lru_cache

import keras.models as models
import nltk
import spacy
from nltk.corpus import stopwords

import modules.utilities.utils as utils
# Root directory for all on-disk artefacts used by this module.
BASE_PATH = './data/'
# Whitespace-separated vocabulary file (read by load_vocab).
VOCAB = BASE_PATH + 'vocab.txt'
# Directory holding serialized Keras models.
MODEL = BASE_PATH + 'model/'
# Directory holding standalone weight files.
WEIGHTS = BASE_PATH + 'weights/'
# Directory holding serialized tokenizer JSON files.
TOKEN = BASE_PATH + 'tokenizer/'
def init():
    """Prepare the NLP resources this module depends on.

    Downloads the NLTK stop-word corpus and loads the Italian spaCy
    pipeline into the module-level global ``nlp`` used by ``lemma_text``.
    """
    global nlp
    nltk.download('stopwords')
    nlp = spacy.load("it_core_news_lg")
def load_vocab():
    """Read the vocabulary file and return its tokens as a set.

    The file at ``VOCAB`` is split on whitespace; duplicates collapse
    into the returned set.
    """
    raw = utils.load_doc(VOCAB)
    return set(raw.split())
def lemma_text(text):
    """Return *text* with every token replaced by its lemma.

    Relies on the module-level spaCy pipeline ``nlp`` loaded by ``init``;
    calling this before ``init`` raises ``NameError``.
    """
    return ' '.join(token.lemma_ for token in nlp(text))
def splitWords(l):
    """Yield every whitespace-separated token of every string in *l*."""
    for chunk in l:
        for token in chunk.split():
            yield token
@lru_cache(maxsize=1)
def _italian_stopwords():
    """Build the Italian stop-word set once per process.

    The original code rebuilt this set from the NLTK corpus on every
    call to ``clean_doc``; caching it is a pure speed-up.
    """
    return frozenset(stopwords.words('italian'))


def clean_doc(doc):
    """Tokenize *doc* and return a cleaned list of words.

    Pipeline: split on whitespace, replace punctuation with spaces and
    re-split, keep alphabetic tokens only, drop Italian stop words, and
    drop tokens shorter than three characters.

    :param doc: raw document text.
    :returns: list of cleaned lowercase-insensitive tokens (case is
        preserved — callers filter against the vocab as-is).
    """
    # Split into rough tokens by whitespace.
    tokens = doc.split()
    # Replace punctuation with spaces (may create multi-word tokens) ...
    tokens = [re.sub(r'[^\w\s]', ' ', w) for w in tokens]
    # ... then re-split those back into single words.
    tokens = list(splitWords(tokens))
    # Keep purely alphabetic tokens (drops numbers and underscores).
    tokens = [word for word in tokens if word.isalpha()]
    # Drop Italian stop words (cached set, built once).
    stop_words = _italian_stopwords()
    tokens = [w for w in tokens if w not in stop_words]
    # Drop very short tokens.
    tokens = [word for word in tokens if len(word) > 2]
    return tokens
def predict_sentiment(review, vocab, tokenizer, model):
    """Score *review* and return ``(confidence, label)``.

    The review is cleaned, restricted to words present in *vocab*,
    binary-encoded by *tokenizer*, and scored by *model*. The label is
    ``'POSITIVE'`` or ``'NEGATIVE'``; the confidence is the model's
    probability for that label.
    """
    # Clean and keep only in-vocabulary words, then rebuild one line.
    words = [w for w in clean_doc(review) if w in vocab]
    line = ' '.join(words)
    # Binary bag-of-words encoding expected by the model.
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0, 0]
    # Probability below 0.5 rounds to 0 -> negative class.
    if round(percent_pos) == 0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'
def predict(model_path, weights_path, tokenizer_path, text, debug):
    """Run the full classification pipeline over *text*.

    Loads the model, optional weights, tokenizer and vocabulary, then
    lemmatizes *text* and scores it.

    :param model_path: path to a serialized Keras model.
    :param weights_path: optional weights file; skipped when ''.
    :param tokenizer_path: path to a serialized tokenizer.
    :param text: raw input sentence.
    :param debug: when truthy, print the review and its score.
    :returns: ``(lemmatized_text, sentiment_label, confidence)``.
    """
    # Model (and optional weights) first, then the text-side resources.
    model = models.load_model(model_path, compile=False)
    if weights_path != '':
        model.load_weights(weights_path, skip_mismatch=True, by_name=True)
    tokenizer = utils.load_tokenizer(tokenizer_path)
    vocab = load_vocab()
    doc = lemma_text(text)
    percent, sentiment = predict_sentiment(doc, vocab, tokenizer, model)
    if debug:
        print('Review: [%s]\nSentiment: %s (%.3f%%)' % (doc, sentiment, percent * 100))
    return doc, sentiment, percent
def binary_classification(text):
    """Classify *text* as positive or negative sentiment.

    :param text: input sentence (Italian).
    :returns: ``{'positive': 'p.pp', 'negative': 'n.nn'}`` with the two
        probabilities formatted to two decimals, or the error tuple
        ``({'error': 'Sentence is required'}, 415)`` for empty input.
    """
    # Plain guard clause instead of the original raise/except dance.
    # Also rejects None, which the old `text == ""` check let through
    # (it would have crashed later inside the pipeline).
    if not text:
        return {"error": "Sentence is required"}, 415
    init()
    model = MODEL + 'binary-classification.h5'
    tokenizer = TOKEN + 'binary-classification-tokenizer.json'
    _, sentiment, percent = predict(model, '', tokenizer, text, False)
    # `percent` is the confidence in the predicted label. float() handles
    # both numpy scalars and plain floats; the original `.astype(float)`
    # crashed on plain Python floats.
    confidence = float(percent)
    positive = confidence if sentiment == 'POSITIVE' else 1.0 - confidence
    # Original multiplied by 100 and then divided by 100 before
    # formatting; the fractions are emitted directly here.
    return {
        'positive': "%.2f" % positive,
        'negative': "%.2f" % (1.0 - positive),
    }