Spaces:
Running
Running
File size: 3,173 Bytes
2ff9250 5d81264 2ff9250 40ba4b5 2ff9250 e33d658 2ff9250 e33d658 2ff9250 d4c306a 2ff9250 c469f55 2ff9250 a9d51d0 2ff9250 db62dd1 2ff9250 db62dd1 2ff9250 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import spacy
import re
import keras.models as models
from nltk.corpus import stopwords
import nltk
import modules.utilities.utils as utils
BASE_PATH = './data/'
VOCAB = BASE_PATH + 'vocab.txt'
MODEL = BASE_PATH + 'model/'
WEIGHTS = BASE_PATH + 'weights/'
TOKEN = BASE_PATH + 'tokenizer/'
def init():
    """Download the NLTK stop-word corpus and load the Italian spaCy pipeline.

    Side effect: binds the loaded pipeline to the module-level global
    ``nlp``, which ``lemma_text`` reads.
    """
    nltk.download('stopwords')
    global nlp
    # Large Italian news pipeline; must be installed separately
    # (python -m spacy download it_core_news_lg).
    nlp = spacy.load("it_core_news_lg")
def load_vocab():
    """Read the vocabulary file and return its whitespace-separated words as a set."""
    text = utils.load_doc(VOCAB)
    return set(text.split())
def lemma_text(text):
    """Lemmatize *text* with the globally loaded spaCy pipeline.

    Returns the lemmas joined back into a single space-separated string.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)
def splitWords(l):
    """Yield every whitespace-separated word of every string in *l*, in order."""
    for chunk in l:
        for word in chunk.split():
            yield word
def clean_doc(doc):
    """Tokenize and normalize a document for classification.

    Pipeline: replace punctuation with spaces, re-split, keep only purely
    alphabetic tokens, drop Italian stop words and tokens of length <= 2.

    Parameters: doc -- raw text string.
    Returns: list of cleaned token strings.
    """
    # Compile the punctuation pattern once instead of re-matching the raw
    # pattern string inside the comprehension (was also dead commented-out
    # string.punctuation code here — removed).
    non_word = re.compile(r'[^\w\s]')
    # A raw whitespace token may contain embedded punctuation, so substitute
    # punctuation with spaces and split again.
    tokens = [non_word.sub(' ', w) for w in doc.split()]
    tokens = list(splitWords(tokens))
    # Keep purely alphabetic tokens only.
    tokens = [w for w in tokens if w.isalpha()]
    # Filter out Italian stop words.
    stop_words = set(stopwords.words('italian'))
    tokens = [w for w in tokens if w not in stop_words]
    # Drop very short tokens.
    return [w for w in tokens if len(w) > 2]
def predict_sentiment(review, vocab, tokenizer, model):
    """Score *review* with *model* and return (confidence, label).

    The review is cleaned, restricted to in-vocabulary tokens, binary-encoded
    with *tokenizer*, and classified. The confidence always refers to the
    returned label ('POSITIVE' or 'NEGATIVE').
    """
    # Clean and keep only words the model's vocabulary knows.
    words = [w for w in clean_doc(review) if w in vocab]
    line = ' '.join(words)
    # Binary bag-of-words encoding, then a single prediction.
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0, 0]
    # Rounding to 0 means the negative class won; report its confidence.
    if round(percent_pos) == 0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'
def predict(model_path, weights_path, tokenizer_path, text, debug):
    """Load the tokenizer, vocab and model, then classify *text*.

    Parameters:
        model_path -- path to the saved Keras model.
        weights_path -- optional weights file; skipped when empty string.
        tokenizer_path -- path to the serialized tokenizer.
        text -- raw input text (lemmatized before prediction).
        debug -- when truthy, print the review, label and confidence.
    Returns: (lemmatized_doc, sentiment_label, confidence).
    """
    tokenizer = utils.load_tokenizer(tokenizer_path)
    vocab = load_vocab()
    doc = lemma_text(text)
    model = models.load_model(model_path, compile=False)
    # Only load extra weights when a path was actually supplied.
    if weights_path != '':
        model.load_weights(weights_path, skip_mismatch=True, by_name=True)
    percent, sentiment = predict_sentiment(doc, vocab, tokenizer, model)
    if debug:
        print('Review: [%s]\nSentiment: %s (%.3f%%)' % (doc, sentiment, percent * 100))
    return doc, sentiment, percent
def binary_classification(text):
    """Run binary sentiment classification on *text*.

    Returns a dict with 'positive' and 'negative' probabilities formatted
    to two decimals, or the tuple ({"error": ...}, 415) when *text* is empty.
    """
    # Validate BEFORE init(): the original raised/caught a bare Exception as
    # control flow and had already downloaded the stopword corpus on a
    # request that was about to be rejected.
    if text == "":
        return {"error": "Sentence is required"}, 415
    init()
    model = MODEL + 'binary-classification.h5'
    weights = ''
    tokenizer = TOKEN + 'binary-classification-tokenizer.json'
    doc, sentiment, percent = predict(model, weights, tokenizer, text, False)
    # float() handles both numpy scalars and plain floats, unlike the
    # original .astype(float), which crashes on a plain Python float.
    # The confidence refers to the predicted class; derive the positive
    # fraction directly instead of the *100-then-/100 round-trip.
    pos_fraction = float(percent) if sentiment == 'POSITIVE' else 1.0 - float(percent)
    return {
        'positive': "%.2f" % pos_fraction,
        'negative': "%.2f" % (1.0 - pos_fraction),
    }
|