import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance


class PreTrainedPipeline:
    """Embeds a Persian input sentence with a TensorFlow SavedModel and
    returns the ten nearest entries of a precomputed comparison matrix,
    ranked by cosine distance."""

    def __init__(self, path: str):
        # Locations of the exported model and its lookup tables inside `path`.
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # token -> id and id -> human-readable label mappings.
        with open(self.t2id_path, encoding="utf8") as f:
            self.t2id = json.load(f)
        with open(self.id2h_path, encoding="utf8") as f:
            self.id2h = json.load(f)

        # One stopword per line; a set makes membership tests O(1).
        with open(self.stopwords_path, encoding="utf8") as f:
            self.stopwords = set(line.strip() for line in f)

        # Precomputed embedding matrix that query vectors are compared against.
        self.comparisons = np.load(self.comparison_matrix_path)['arr_0']

        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        # Normalize and tokenize the input with hazm, then drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = word_tokenize(sentence)
        tokens = [t for t in tokens if t not in self.stopwords]

        # Encode at most 20 tokens; unknown tokens fall back to the 'UNK' id
        # and unused positions stay 0 (padding).
        input_ids = np.zeros((1, 20), dtype=np.int32)
        for i, token in enumerate(tokens[:20]):
            input_ids[0, i] = self.t2id.get(token, self.t2id['UNK'])

        # Embed the sentence; the SavedModel returns a single 300-d vector.
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Cosine distance to every row of the comparison matrix; the ten
        # smallest distances are the ten nearest neighbours.
        similarities = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = similarities.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]

        # Convert distances to scores: scale by -8 (a sharpening factor) and
        # softmax so the ten scores sum to 1.
        logits = -8 * similarities[top_indices]
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(p), 3) for p in softmax_probs]

        return [
            [{'label': word, 'score': score} for word, score in zip(top_words, top_scores)]
        ]
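

# Minimal usage sketch (illustrative, not part of the served pipeline):
# "path/to/model_repo" is a placeholder for a local directory laid out the
# way __init__ expects -- saved_model/, t2id.json, id2h.json, stopwords.txt,
# and comparison_matrix.npz.
if __name__ == "__main__":
    pipe = PreTrainedPipeline("path/to/model_repo")
    results = pipe("...")  # replace "..." with a Persian input sentence
    for item in results[0]:
        print(item['label'], item['score'])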