| # from scipy.special import softmax | |
| import tensorflow as tf | |
| from transformers import Pipeline | |
| import tensorflow as tf | |
| import numpy as np | |
| import json | |
| from hazm import * | |
| from scipy.spatial import distance | |
class PreTrainedPipeline():
    """Map a sentence to the entries nearest to the model's output vector.

    The sentence is normalized, tokenized, stripped of stop words and
    encoded as a fixed-length row of token ids. The saved model turns that
    row into a vector, which is compared by cosine distance against every
    row of the comparison matrix. The ``id2h`` entries of the closest rows
    are returned.
    """

    # Number of token ids fed to the model; longer inputs are truncated and
    # shorter ones are padded with id 0.
    MAX_TOKENS = 20
    # Number of nearest entries reported by ``__call__``.
    TOP_K = 4

    def __init__(self, path):
        """Load the model and its lookup resources from ``path``.

        Args:
            path: Directory containing ``saved_model``, ``t2id.json``,
                ``id2h.json``, ``stopwords.txt`` and
                ``comparison_matrix.npz``.
        """
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # Context managers make sure every file handle is closed again.
        with open(self.t2id_path, encoding="utf8") as t2id_file:
            self.t2id = json.load(t2id_file)
        with open(self.id2h_path, encoding="utf8") as id2h_file:
            self.id2h = json.load(id2h_file)
        with open(self.stopwords_path, encoding="utf8") as stopwords_file:
            self.stopwords = {line.strip() for line in stopwords_file}
        # Indexing the archive reads the array into memory, so the archive
        # itself can be closed right after.
        with np.load(self.comparison_matrix_path) as archive:
            self.comparisons = archive['arr_0']

        self.model = tf.saved_model.load(self.model_dir)
        # Built once here so it is not re-created on every call.
        self._normalizer = Normalizer()

    def __call__(self, inputs: str):
        """Return the ``TOP_K`` nearest entries for the sentence ``inputs``.

        Args:
            inputs: The raw input sentence.

        Returns:
            A list holding one list of ``{'label': ..., 'score': 0}`` dicts,
            ordered from nearest to farthest. Each label is a one-element
            list wrapping the ``id2h`` entry.
        """
        # Preprocess the input sentence
        sentence = self._normalizer.normalize(inputs)
        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]
        input_ids = self._encode(tokens)

        # Call the model on the input ids
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Postprocess the embeddings to get the most similar words
        nearest = self._nearest_labels(embeddings, self.TOP_K)

        # NOTE(review): the label is kept as a one-element list and the score
        # as a constant 0 to preserve the existing output format -- confirm
        # whether consumers expect a plain string and a real score.
        return [[{'label': [entry], 'score': 0} for entry in nearest]]

    def _encode(self, tokens):
        """Encode ``tokens`` as a ``(1, MAX_TOKENS)`` int32 array of ids."""
        # Allocated as int32 so the ids are already integers when handed to
        # TensorFlow instead of relying on a float-to-int conversion.
        input_ids = np.zeros((1, self.MAX_TOKENS), dtype=np.int32)
        kept = tokens[:self.MAX_TOKENS]
        if kept:
            # Looked up only when there is something to encode.
            unk_id = self.t2id['UNK']
            input_ids[0, :len(kept)] = [self.t2id.get(t, unk_id) for t in kept]
        return input_ids

    def _nearest_labels(self, embedding, k):
        """Return the ``id2h`` entries of the ``k`` closest comparison rows."""
        query = np.asarray(embedding).reshape(1, -1)
        # cdist yields cosine *distances*: smaller means more similar, so an
        # ascending argsort puts the best matches first.
        distances = distance.cdist(query, self.comparisons, "cosine")[0]
        # Slicing (rather than indexing a fixed count) tolerates a
        # comparison matrix with fewer than k rows.
        return [self.id2h[str(row)] for row in distances.argsort()[:k]]
| # return [ | |
| # [ # Sample output, call the model here TODO | |
| # {'label': 'POSITIVE', 'score': 0.05}, | |
| # {'label': 'NEGATIVE', 'score': 0.03}, | |
| # {'label': 'معنی', 'score': 0.92}, | |
| # {'label': f'{inputs}', 'score': 0}, | |
| # ] | |
| # ] | |
| # def RevDict(sent,flag,model): | |
| # """ | |
| # This function receives a sentence from the user and prints the top-10 (for flag=0) or top-100 (for flag=1) predictions. | |
| # The input sentence is normalized and stop words are removed. | |
| # """ | |
| # normalizer = Normalizer() | |
| # X_Normalized = normalizer.normalize(sent) | |
| # X_Tokens = word_tokenize(X_Normalized) | |
| # stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(r"stopwords.txt",'r','utf-8').readlines()] | |
| # X_Tokens = [t for t in X_Tokens if t not in stopwords] | |
| # preprocessed = [' '.join(X_Tokens)][0] | |
| # sent_ids = sent2id([preprocessed]) | |
| # output=np.array((model.predict(sent_ids.reshape((1,20))).tolist()[0])) | |
| # distances=distance.cdist(output.reshape((1,300)), comparison_matrix, "cosine")[0] | |
| # min_index_100 = distances.argsort()[:100] | |
| # min_index_10 = distances.argsort()[:10] | |
| # temp=[] | |
| # if flag == 0: | |
| # for i in range(10): | |
| # temp.append(id2h[str(min_index_10[i])]) | |
| # elif flag == 1: | |
| # for i in range(100): | |
| # temp.append(id2h[str(min_index_100[i])]) | |
| # for i in range(len(temp)): | |
| # print(temp[i]) | |
| # def sent2id(sents): | |
| # sents_id=np.zeros((len(sents),20)) | |
| # for j in tqdm(range(len(sents))): | |
| # for i,word in enumerate(sents[j].split()): | |
| # try: | |
| # sents_id[j,i] = t2id[word] | |
| # except: | |
| # sents_id[j,i] = t2id['UNK'] | |
| # if i==19: | |
| # break | |
| # return sents_id | |