import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance


class PreTrainedPipeline:
    """Embeds a Persian input sentence with a TensorFlow SavedModel and
    returns the ten nearest entries of a precomputed comparison matrix,
    ranked by cosine distance."""

    def __init__(self, path: str):
        # Locations of the exported model and its lookup tables inside `path`.
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # token -> id and id -> human-readable label mappings.
        with open(self.t2id_path, encoding="utf8") as f:
            self.t2id = json.load(f)
        with open(self.id2h_path, encoding="utf8") as f:
            self.id2h = json.load(f)

        # One stopword per line; a set makes membership tests O(1).
        with open(self.stopwords_path, encoding="utf8") as f:
            self.stopwords = set(line.strip() for line in f)

        # Precomputed embedding matrix that query vectors are compared against.
        self.comparisons = np.load(self.comparison_matrix_path)['arr_0']

        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        # Normalize and tokenize the input with hazm, then drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = word_tokenize(sentence)
        tokens = [t for t in tokens if t not in self.stopwords]

        # Encode at most 20 tokens; unknown tokens fall back to the 'UNK' id
        # and unused positions stay 0 (padding).
        input_ids = np.zeros((1, 20), dtype=np.int32)
        for i, token in enumerate(tokens[:20]):
            input_ids[0, i] = self.t2id.get(token, self.t2id['UNK'])

        # Embed the sentence; the SavedModel returns a single 300-d vector.
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Cosine distance to every row of the comparison matrix; the ten
        # smallest distances are the ten nearest neighbours.
        similarities = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = similarities.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]

        # Convert distances to scores: scale by -8 (a sharpening factor) and
        # softmax so the ten scores sum to 1.
        logits = -8 * similarities[top_indices]
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(p), 3) for p in softmax_probs]

        return [
            [{'label': word, 'score': score} for word, score in zip(top_words, top_scores)]
        ]
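

# Minimal usage sketch (illustrative, not part of the served pipeline):
# "path/to/model_repo" is a placeholder for a local directory laid out the
# way __init__ expects -- saved_model/, t2id.json, id2h.json, stopwords.txt,
# and comparison_matrix.npz.
if __name__ == "__main__":
    pipe = PreTrainedPipeline("path/to/model_repo")
    results = pipe("...")  # replace "..." with a Persian input sentence
    for item in results[0]:
        print(item['label'], item['score'])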