# ngt-ai-platform / modules/binary_classification.py
# Updated binary-classification components (commit e33d658).
import re
from functools import lru_cache

import keras.models as models
import nltk
import spacy
from nltk.corpus import stopwords

import modules.utilities.utils as utils
# Root directory for all on-disk artefacts used by this module.
BASE_PATH = './data/'
# Whitespace-separated vocabulary file (read by load_vocab).
VOCAB = BASE_PATH + 'vocab.txt'
# Directory holding serialized Keras models.
MODEL = BASE_PATH + 'model/'
# Directory holding standalone weight files.
WEIGHTS = BASE_PATH + 'weights/'
# Directory holding serialized tokenizer JSON files.
TOKEN = BASE_PATH + 'tokenizer/'
def init():
    """Prepare the NLP resources this module depends on.

    Downloads the NLTK stop-word corpus and loads the Italian spaCy
    pipeline into the module-level global ``nlp`` used by ``lemma_text``.
    """
    global nlp
    nltk.download('stopwords')
    nlp = spacy.load("it_core_news_lg")
def load_vocab():
    """Read the vocabulary file and return its tokens as a set.

    The file at ``VOCAB`` is split on whitespace; duplicates collapse
    into the returned set.
    """
    raw = utils.load_doc(VOCAB)
    return set(raw.split())
def lemma_text(text):
    """Return *text* with every token replaced by its lemma.

    Relies on the module-level spaCy pipeline ``nlp`` loaded by ``init``;
    calling this before ``init`` raises ``NameError``.
    """
    return ' '.join(token.lemma_ for token in nlp(text))
def splitWords(l):
    """Yield every whitespace-separated token of every string in *l*."""
    for chunk in l:
        for token in chunk.split():
            yield token
@lru_cache(maxsize=1)
def _italian_stopwords():
    """Build the Italian stop-word set once per process.

    The original code rebuilt this set from the NLTK corpus on every
    call to ``clean_doc``; caching it is a pure speed-up.
    """
    return frozenset(stopwords.words('italian'))


def clean_doc(doc):
    """Tokenize *doc* and return a cleaned list of words.

    Pipeline: split on whitespace, replace punctuation with spaces and
    re-split, keep alphabetic tokens only, drop Italian stop words, and
    drop tokens shorter than three characters.

    :param doc: raw document text.
    :returns: list of cleaned lowercase-insensitive tokens (case is
        preserved — callers filter against the vocab as-is).
    """
    # Split into rough tokens by whitespace.
    tokens = doc.split()
    # Replace punctuation with spaces (may create multi-word tokens) ...
    tokens = [re.sub(r'[^\w\s]', ' ', w) for w in tokens]
    # ... then re-split those back into single words.
    tokens = list(splitWords(tokens))
    # Keep purely alphabetic tokens (drops numbers and underscores).
    tokens = [word for word in tokens if word.isalpha()]
    # Drop Italian stop words (cached set, built once).
    stop_words = _italian_stopwords()
    tokens = [w for w in tokens if w not in stop_words]
    # Drop very short tokens.
    tokens = [word for word in tokens if len(word) > 2]
    return tokens
def predict_sentiment(review, vocab, tokenizer, model):
    """Score *review* and return ``(confidence, label)``.

    The review is cleaned, restricted to words present in *vocab*,
    binary-encoded by *tokenizer*, and scored by *model*. The label is
    ``'POSITIVE'`` or ``'NEGATIVE'``; the confidence is the model's
    probability for that label.
    """
    # Clean and keep only in-vocabulary words, then rebuild one line.
    words = [w for w in clean_doc(review) if w in vocab]
    line = ' '.join(words)
    # Binary bag-of-words encoding expected by the model.
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0, 0]
    # Probability below 0.5 rounds to 0 -> negative class.
    if round(percent_pos) == 0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'
def predict(model_path, weights_path, tokenizer_path, text, debug):
    """Run the full classification pipeline over *text*.

    Loads the model, optional weights, tokenizer and vocabulary, then
    lemmatizes *text* and scores it.

    :param model_path: path to a serialized Keras model.
    :param weights_path: optional weights file; skipped when ''.
    :param tokenizer_path: path to a serialized tokenizer.
    :param text: raw input sentence.
    :param debug: when truthy, print the review and its score.
    :returns: ``(lemmatized_text, sentiment_label, confidence)``.
    """
    # Model (and optional weights) first, then the text-side resources.
    model = models.load_model(model_path, compile=False)
    if weights_path != '':
        model.load_weights(weights_path, skip_mismatch=True, by_name=True)
    tokenizer = utils.load_tokenizer(tokenizer_path)
    vocab = load_vocab()
    doc = lemma_text(text)
    percent, sentiment = predict_sentiment(doc, vocab, tokenizer, model)
    if debug:
        print('Review: [%s]\nSentiment: %s (%.3f%%)' % (doc, sentiment, percent * 100))
    return doc, sentiment, percent
def binary_classification(text):
    """Classify *text* as positive or negative sentiment.

    :param text: input sentence (Italian).
    :returns: ``{'positive': 'p.pp', 'negative': 'n.nn'}`` with the two
        probabilities formatted to two decimals, or the error tuple
        ``({'error': 'Sentence is required'}, 415)`` for empty input.
    """
    # Plain guard clause instead of the original raise/except dance.
    # Also rejects None, which the old `text == ""` check let through
    # (it would have crashed later inside the pipeline).
    if not text:
        return {"error": "Sentence is required"}, 415
    init()
    model = MODEL + 'binary-classification.h5'
    tokenizer = TOKEN + 'binary-classification-tokenizer.json'
    _, sentiment, percent = predict(model, '', tokenizer, text, False)
    # `percent` is the confidence in the predicted label. float() handles
    # both numpy scalars and plain floats; the original `.astype(float)`
    # crashed on plain Python floats.
    confidence = float(percent)
    positive = confidence if sentiment == 'POSITIVE' else 1.0 - confidence
    # Original multiplied by 100 and then divided by 100 before
    # formatting; the fractions are emitted directly here.
    return {
        'positive': "%.2f" % positive,
        'negative': "%.2f" % (1.0 - positive),
    }