File size: 3,173 Bytes
2ff9250
 
 
 
 
 
 
5d81264
2ff9250
 
 
 
 
 
 
 
40ba4b5
2ff9250
 
 
 
 
 
 
 
 
 
 
 
 
e33d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ff9250
 
 
 
 
 
 
 
 
e33d658
2ff9250
 
 
 
 
 
 
 
 
d4c306a
 
 
2ff9250
 
 
 
 
 
 
 
c469f55
2ff9250
a9d51d0
2ff9250
 
 
 
 
 
 
 
 
 
db62dd1
 
2ff9250
db62dd1
 
2ff9250
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import spacy
import re
import keras.models as models
from nltk.corpus import stopwords
import nltk
import modules.utilities.utils as utils

# Root directory for all persisted model artifacts.
BASE_PATH = './data/'
# Whitespace-separated vocabulary file used to filter review tokens.
VOCAB = BASE_PATH + 'vocab.txt'
# Directory holding the saved Keras model file(s).
MODEL = BASE_PATH + 'model/'
# Directory holding optional model weight files.
WEIGHTS = BASE_PATH + 'weights/'
# Directory holding serialized tokenizer JSON files.
TOKEN = BASE_PATH + 'tokenizer/'

def init():
	"""Prepare NLP resources used by this module.

	Downloads the NLTK stopword corpus and loads the Italian spaCy
	pipeline, storing it in the module-level global ``nlp`` so the
	other helpers here can use it.
	"""
	global nlp
	nltk.download('stopwords')
	nlp = spacy.load("it_core_news_lg")

def load_vocab():
	"""Read the vocabulary file and return its words as a set."""
	contents = utils.load_doc(VOCAB)
	return set(contents.split())

def lemma_text(text):
	"""Lemmatize *text* with the module-level spaCy pipeline ``nlp``.

	Returns the lemmas joined back into a single space-separated string.
	"""
	return ' '.join(tok.lemma_ for tok in nlp(text))

def splitWords(l):
    """Yield every whitespace-separated word from each string in *l*."""
    for entry in l:
        for word in entry.split():
            yield word

def clean_doc(doc):
    """Tokenize *doc* and return a cleaned list of tokens.

    Cleaning steps: punctuation is replaced with spaces (re-splitting any
    words that were glued together), non-alphabetic tokens are dropped,
    Italian stop words are removed, and tokens shorter than 3 characters
    are discarded.
    """
    # Replace punctuation with spaces; re-split afterwards so a token like
    # "buono,ottimo" becomes two words instead of one.
    depunctuated = (re.sub(r'[^\w\s]', ' ', w) for w in doc.split())
    tokens = list(splitWords(depunctuated))
    # Keep purely alphabetic tokens only (drops digits and mixed tokens).
    tokens = [word for word in tokens if word.isalpha()]
    # Filter out Italian stop words.
    stop_words = set(stopwords.words('italian'))
    tokens = [w for w in tokens if w not in stop_words]
    # Filter out very short (1-2 character) tokens.
    return [word for word in tokens if len(word) > 2]

def predict_sentiment(review, vocab, tokenizer, model):
    """Score *review* and return a ``(confidence, label)`` pair.

    The review is cleaned, restricted to words present in *vocab*,
    binary-encoded with *tokenizer*, and scored by *model*.  The label is
    'POSITIVE' or 'NEGATIVE' with the matching confidence value.
    """
    # Clean the text and keep only in-vocabulary words.
    words = [w for w in clean_doc(review) if w in vocab]
    line = ' '.join(words)
    # Binary bag-of-words encoding expected by the model.
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0, 0]
    # Scores rounding to 0 mean the negative class.
    if round(percent_pos) == 0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'

def predict(model_path, weights_path, tokenizer_path, text, debug):
	"""Run a full sentiment prediction for *text*.

	Lemmatizes the input, loads the tokenizer, vocabulary and Keras model
	(applying extra weights when *weights_path* is non-empty), and returns
	a ``(doc, sentiment, percent)`` tuple: the lemmatized text, the label,
	and its confidence.  When *debug* is true the result is also printed.
	"""
	doc = lemma_text(text)
	vocab = load_vocab()
	tokenizer = utils.load_tokenizer(tokenizer_path)
	model = models.load_model(model_path, compile=False)
	if weights_path != '':
		model.load_weights(weights_path, skip_mismatch=True, by_name=True)
	percent, sentiment = predict_sentiment(doc, vocab, tokenizer, model)
	if debug:
		print('Review: [%s]\nSentiment: %s (%.3f%%)' % (doc, sentiment, percent*100))
	return doc, sentiment, percent

def binary_classification(text):
	"""Classify *text* as positive or negative sentiment.

	Returns a dict with 'positive' and 'negative' keys holding the class
	probabilities as two-decimal strings (fractions in [0, 1]), or an
	``({"error": ...}, 415)`` tuple when *text* is empty.
	"""
	# Validate input up front (the original raised a bare Exception and
	# caught it with a bare except, which also hid any real error), and
	# before init() so an empty request skips the NLTK download.
	if not text:
		return {"error": "Sentence is required"}, 415
	init()
	model = MODEL + 'binary-classification.h5'
	weights = ''
	tokenizer = TOKEN + 'binary-classification-tokenizer.json'
	doc, sentiment, percent = predict(model, weights, tokenizer, text, False)
	# float() accepts both numpy scalars and plain Python floats; the
	# original .astype(float) crashed on plain floats.
	confidence = float(percent)
	if sentiment == 'POSITIVE':
		positive = confidence
		negative = 1.0 - confidence
	else:
		negative = confidence
		positive = 1.0 - negative
	# Same formatted output as before, without the x100-then-/100 round trip.
	return {
		'positive': "%.2f" % positive,
		'negative': "%.2f" % negative,
	}