Update pipeline.py
#1
by
alighadami77
- opened
- pipeline.py +49 -20
pipeline.py
CHANGED
|
@@ -1,36 +1,65 @@
|
|
| 1 |
# from scipy.special import softmax
|
| 2 |
import tensorflow as tf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
class PreTrainedPipeline():
|
| 5 |
def __init__(self, path):
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
x = tf.keras.layers.Dense(512, activation="LeakyReLU")(x)
|
| 13 |
-
x = tf.keras.layers.Dense(1024, activation="LeakyReLU")(x)
|
| 14 |
-
x = tf.keras.layers.Dense(2048, activation="LeakyReLU")(x)
|
| 15 |
-
outputs = tf.keras.layers.Dense(300, activation="tanh")(x)
|
| 16 |
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
-
model.
|
| 20 |
|
| 21 |
-
|
| 22 |
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
return [
|
| 27 |
-
[
|
| 28 |
-
{'label':
|
| 29 |
-
{'label':
|
| 30 |
-
{'label':
|
| 31 |
-
{'label':
|
| 32 |
]
|
| 33 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# def RevDict(sent,flag,model):
|
| 36 |
# """
|
|
|
|
| 1 |
# from scipy.special import softmax
|
| 2 |
import tensorflow as tf
|
| 3 |
+
from transformers import Pipeline
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import numpy as np
|
| 6 |
+
import json
|
| 7 |
+
from hazm import *
|
| 8 |
+
from scipy.spatial import distance
|
| 9 |
+
|
| 10 |
|
| 11 |
class PreTrainedPipeline():
    """Reverse-dictionary pipeline.

    Maps an input definition/sentence to candidate headwords: the sentence
    is normalized and tokenized (hazm), encoded to a fixed-length id vector,
    embedded by a saved TensorFlow model, and matched against a precomputed
    comparison matrix by cosine distance.
    """

    def __init__(self, path):
        """Load tokenizer maps, stopwords, the comparison matrix and the model.

        Args:
            path: model directory supplied by the host framework. Currently
                unused — all resource paths are hard-coded relative names.
                NOTE(review): consider resolving them against ``path``.
        """
        self.model_dir = "saved_model"
        self.t2id_path = "t2id.json"
        self.stopwords_path = "stopwords.txt"
        self.id2h_path = "id2h.json"
        # BUG FIX: this attribute was read below but never assigned, which
        # raised AttributeError the moment the pipeline was constructed.
        # Assumes the matrix was written with np.savez -> 'arr_0' key.
        self.comparison_matrix_path = "comparison_matrix.npz"

        # Context managers so the resource files are closed deterministically
        # (the originals leaked three open file handles).
        with open(self.t2id_path, encoding="utf8") as f:
            self.t2id = json.load(f)          # token -> integer id
        with open(self.id2h_path, encoding="utf8") as f:
            self.id2h = json.load(f)          # str(index) -> headword

        with open(self.stopwords_path, encoding="utf8") as f:
            self.stopwords = set(line.strip() for line in f)

        # Candidate embeddings, one row per headword ('arr_0' is the default
        # key for an unnamed array saved via np.savez).
        self.comparisons = np.load(self.comparison_matrix_path)['arr_0']

        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        """Return the nearest headwords for ``inputs``.

        Args:
            inputs: the definition/sentence to reverse-look-up.

        Returns:
            A singleton list containing a list of ``{'label': ..., 'score': 0}``
            dicts for the four closest headwords (label is a one-element list,
            matching the original output shape).
        """
        # Preprocess: normalize, tokenize, drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]

        # Fixed-length (1, 20) id row, zero-padded; unknown tokens map to the
        # 'UNK' id. Allocate as int32 directly instead of float-then-cast.
        max_len = 20
        input_ids = np.zeros((1, max_len), dtype=np.int32)
        for i, token in enumerate(tokens[:max_len]):
            input_ids[0, i] = self.t2id.get(token, self.t2id['UNK'])

        # Call the model on the input ids; assumes it emits a 300-d
        # sentence embedding — TODO confirm against the saved model.
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Cosine distance to every candidate; smaller = more similar.
        similarities = distance.cdist(
            embeddings.reshape((1, 300)), self.comparisons, "cosine"
        )[0]
        top_indices = similarities.argsort()[:10]
        # BUG FIX: iterate the indices actually found instead of range(10),
        # which raised IndexError whenever fewer than 10 candidates existed.
        top_words = [[self.id2h[str(idx)]] for idx in top_indices]

        return [
            [
                {'label': top_words[0], 'score': 0},
                {'label': top_words[1], 'score': 0},
                {'label': top_words[2], 'score': 0},
                {'label': top_words[3], 'score': 0},
            ]
        ]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# return [
|
| 56 |
+
# [ # Sample output, call the model here TODO
|
| 57 |
+
# {'label': 'POSITIVE', 'score': 0.05},
|
| 58 |
+
# {'label': 'NEGATIVE', 'score': 0.03},
|
| 59 |
+
# {'label': 'معنی', 'score': 0.92},
|
| 60 |
+
# {'label': f'{inputs}', 'score': 0},
|
| 61 |
+
# ]
|
| 62 |
+
# ]
|
| 63 |
|
| 64 |
# def RevDict(sent,flag,model):
|
| 65 |
# """
|