Spaces:
Runtime error
Runtime error
runit
Browse files
app.py
CHANGED
|
@@ -4,7 +4,7 @@ import tempfile
|
|
| 4 |
from typing import Dict, Text
|
| 5 |
import numpy as np
|
| 6 |
import tensorflow as tf
|
| 7 |
-
import tensorflow_recommenders as tfrs
|
| 8 |
import os
|
| 9 |
import unidecode
|
| 10 |
from nltk import word_tokenize
|
|
@@ -18,105 +18,46 @@ import scann
|
|
| 18 |
|
| 19 |
|
| 20 |
df=pd.read_csv("/home/user/app/Dubai_translated_best_2500.csv",sep=",",header=0)
|
| 21 |
-
|
| 22 |
-
for i in range(0,len(df['requisito'])):
|
| 23 |
-
print(len(df['requisito'].iloc[i]))
|
| 24 |
-
|
| 25 |
df=df.drop_duplicates()
|
| 26 |
df=df.dropna()
|
| 27 |
-
|
| 28 |
df["nome_vaga"]=df["nome_vaga"].map(lambda x: x.lower().title())
|
| 29 |
df["requisito"]=df["requisito"].map(lambda x: x[0:1000].lower())
|
| 30 |
-
|
| 31 |
my_dict=dict(df.iloc[0:int(df.shape[0]*0.9),:])
|
| 32 |
-
|
| 33 |
my_dict_cego=dict(df.iloc[int(df.shape[0]*0.9):,:])
|
| 34 |
-
|
| 35 |
-
|
| 36 |
ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
|
| 37 |
"code": x["code"],
|
| 38 |
"nome_vaga": x["nome_vaga"],
|
| 39 |
"requisito": tf.strings.split(x["requisito"],maxsplit=106)
|
| 40 |
})
|
| 41 |
-
|
| 42 |
-
l=[]
|
| 43 |
-
for x in ratings.as_numpy_iterator():
|
| 44 |
-
pprint.pprint(len(x['requisito']))
|
| 45 |
-
l.append(len(x['requisito']))
|
| 46 |
-
|
| 47 |
-
min(l)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
|
| 51 |
"code": x["code"],
|
| 52 |
"nome_vaga": x["nome_vaga"]
|
| 53 |
})
|
| 54 |
-
for x in movies.take(1).as_numpy_iterator():
|
| 55 |
-
pprint.pprint(x)
|
| 56 |
-
|
| 57 |
movies = movies.map(lambda x: x["code"])
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
for x in ratings.take(5).as_numpy_iterator():
|
| 61 |
-
pprint.pprint(x)
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
for x in movies.take(5).as_numpy_iterator():
|
| 65 |
-
pprint.pprint(x)
|
| 66 |
-
|
| 67 |
ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
|
| 68 |
"code": x["code"],
|
| 69 |
"requisito": tf.strings.split(x["requisito"],maxsplit=106)
|
| 70 |
})
|
| 71 |
-
|
| 72 |
tf.random.set_seed(42)
|
| 73 |
shuffled = ratings.shuffle(int(df.shape[0]*0.9), seed=42, reshuffle_each_iteration=False)
|
| 74 |
shuffled2 = ratings_cego.shuffle(int(df.shape[0]*0.1), seed=42, reshuffle_each_iteration=False)
|
| 75 |
-
|
| 76 |
train = shuffled.take(int(df.shape[0]*0.9))
|
| 77 |
# NOTE(review): `train` already takes int(df.shape[0]*0.9) elements from the start of
# `shuffled`, and `shuffled` is created with a fixed seed and
# reshuffle_each_iteration=False, so the elements taken here appear to be the same
# leading elements that `train` contains — i.e. this is not a disjoint hold-out set.
# The rows kept out of `ratings` live in `ratings_cego`/`cego`; confirm which split
# evaluation is supposed to use.
test = shuffled.take(int(df.shape[0]*0.1))
|
| 78 |
cego=shuffled2
|
| 79 |
-
|
| 80 |
-
for x in train.take(1).as_numpy_iterator():
|
| 81 |
-
pprint.pprint(x)
|
| 82 |
-
|
| 83 |
-
for x in test.take(5).as_numpy_iterator():
|
| 84 |
-
pprint.pprint(x)
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
movie_titles = movies#.map(lambda x: x["code"])
|
| 90 |
user_ids = ratings.map(lambda x: x["requisito"])
|
| 91 |
-
|
| 92 |
# Materialise every tokenised "requisito" array produced by `user_ids` into a
# Python list so the arrays can be concatenated into a vocabulary afterwards.
# The previous per-item `try/except: pass` only wrapped `list.append`, which has
# nothing to recover from, and a bare `except` would also have swallowed
# KeyboardInterrupt/SystemExit; errors raised by the dataset iterator itself were
# never caught by it and still propagate here.
xx = list(user_ids.as_numpy_iterator())
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
|
| 103 |
-
|
| 104 |
unique_user_ids = np.unique(np.concatenate(xx))
|
| 105 |
-
|
| 106 |
user_ids=user_ids.batch(int(df.shape[0]*0.9))
|
| 107 |
-
|
| 108 |
layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)
|
| 109 |
-
|
| 110 |
-
for x in ratings.take(1).as_numpy_iterator():
|
| 111 |
-
pprint.pprint(x['requisito'])
|
| 112 |
-
|
| 113 |
-
for x in ratings.take(5).as_numpy_iterator():
|
| 114 |
-
pprint.pprint(np.array(layer(x['requisito'])))
|
| 115 |
-
|
| 116 |
unique_movie_titles[:10]
|
| 117 |
-
|
| 118 |
embedding_dimension = 768
|
| 119 |
-
|
| 120 |
user_model = tf.keras.Sequential([
|
| 121 |
tf.keras.layers.StringLookup(
|
| 122 |
vocabulary=unique_user_ids, mask_token=None),
|
|
@@ -124,31 +65,18 @@ user_model = tf.keras.Sequential([
|
|
| 124 |
tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
|
| 125 |
|
| 126 |
])
|
| 127 |
-
|
| 128 |
-
for x in train.take(5).as_numpy_iterator():
|
| 129 |
-
pprint.pprint(np.array(user_model(x['requisito'])).shape)
|
| 130 |
-
|
| 131 |
-
|
| 132 |
movie_model = tf.keras.Sequential([
|
| 133 |
tf.keras.layers.StringLookup(
|
| 134 |
vocabulary=unique_movie_titles, mask_token=None),
|
| 135 |
tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
|
| 136 |
])
|
| 137 |
-
|
| 138 |
-
for x in train.take(5).as_numpy_iterator():
|
| 139 |
-
pprint.pprint(np.array(movie_model(x['code'])).shape)
|
| 140 |
-
|
| 141 |
-
|
| 142 |
metrics = tfrs.metrics.FactorizedTopK(
|
| 143 |
candidates=movies.batch(df.shape[0]
|
| 144 |
).map(movie_model)
|
| 145 |
)
|
| 146 |
-
|
| 147 |
task = tfrs.tasks.Retrieval(
|
| 148 |
metrics=metrics
|
| 149 |
)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
class MovielensModel(tfrs.Model):
|
| 153 |
|
| 154 |
def __init__(self, user_model, movie_model):
|
|
@@ -158,13 +86,8 @@ class MovielensModel(tfrs.Model):
|
|
| 158 |
self.task: tf.keras.layers.Layer = task
|
| 159 |
|
| 160 |
def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task loss for one batch of features.

    Args:
        features: Mapping that must provide the "requisito" and "code" entries.
        training: Accepted for interface compatibility; not read by this method.

    Returns:
        The value produced by ``self.task`` for the query/candidate embeddings.
    """
    requisito_embeddings = self.user_model(features["requisito"])
    code_embeddings = self.movie_model(features["code"])
    # Collapse axis 1 of the "requisito" embeddings by summation before scoring
    # (presumably the per-token axis — TODO confirm against the input pipeline).
    query_embeddings = tf.reduce_sum(requisito_embeddings, axis=1)
    return self.task(query_embeddings, code_embeddings)
|
| 169 |
|
| 170 |
class NoBaseClassMovielensModel(tf.keras.Model):
|
|
@@ -177,15 +100,12 @@ class NoBaseClassMovielensModel(tf.keras.Model):
|
|
| 177 |
|
| 178 |
def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
|
| 179 |
|
| 180 |
-
# Set up a gradient tape to record gradients.
|
| 181 |
with tf.GradientTape() as tape:
|
| 182 |
|
| 183 |
-
# Loss computation.
|
| 184 |
user_embeddings = self.user_model(features["requisito"])
|
| 185 |
positive_movie_embeddings = self.movie_model(features["code"])
|
| 186 |
loss = self.task(user_embeddings, positive_movie_embeddings)
|
| 187 |
|
| 188 |
-
# Handle regularization losses as well.
|
| 189 |
regularization_loss = sum(self.losses)
|
| 190 |
|
| 191 |
total_loss = loss + regularization_loss
|
|
@@ -202,12 +122,10 @@ class NoBaseClassMovielensModel(tf.keras.Model):
|
|
| 202 |
|
| 203 |
def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
|
| 204 |
|
| 205 |
-
# Loss computation.
|
| 206 |
user_embeddings = self.user_model(features["requisito"])
|
| 207 |
positive_movie_embeddings = self.movie_model(features["code"])
|
| 208 |
loss = self.task(user_embeddings, positive_movie_embeddings)
|
| 209 |
|
| 210 |
-
# Handle regularization losses as well.
|
| 211 |
regularization_loss = sum(self.losses)
|
| 212 |
|
| 213 |
total_loss = loss + regularization_loss
|
|
@@ -222,33 +140,22 @@ class NoBaseClassMovielensModel(tf.keras.Model):
|
|
| 222 |
model = MovielensModel(user_model, movie_model)
|
| 223 |
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
|
| 224 |
cached_train = train.shuffle(int(df.shape[0]*0.9)).batch(int(df.shape[0]*0.9)).cache()
|
| 225 |
-
|
| 226 |
cached_test = test.batch(int(df.shape[0]*0.1)).cache()
|
| 227 |
-
|
| 228 |
path = os.path.join("/home/user/app/", "model/")
|
| 229 |
-
|
| 230 |
-
|
| 231 |
cp_callback = tf.keras.callbacks.ModelCheckpoint(
|
| 232 |
filepath=path,
|
| 233 |
verbose=1,
|
| 234 |
save_weights_only=True,
|
| 235 |
save_freq=2)
|
| 236 |
|
| 237 |
-
|
| 238 |
model.fit(cached_train, callbacks=[cp_callback],epochs=200)
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
index=df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
|
| 243 |
|
| 244 |
-
|
| 245 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 246 |
-
|
| 247 |
# Unwrap the single embedding stored in each one-element list of `index`, in row
# order, so the result can be stacked into the matrix handed to the ScaNN builder.
# Iterating `index` directly covers every row: the previous hard-coded bound of
# 1633 raised IndexError when the cleaned CSV had fewer rows and silently left
# rows out of the search index when it had more. It also avoids rebuilding
# np.array(index) on every iteration.
indice = [wrapped[0] for wrapped in index]
|
| 250 |
|
| 251 |
-
|
| 252 |
searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
|
| 253 |
num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
|
| 254 |
2, quantize=True).build()
|
|
@@ -260,10 +167,7 @@ def predict(text):
|
|
| 260 |
xx = df.iloc[neighbors[0],:].nome_vaga
|
| 261 |
return xx
|
| 262 |
|
| 263 |
-
|
| 264 |
-
|
| 265 |
# Build the Gradio interface around `predict` and launch it immediately.
# NOTE(review): the launch() call that ends this statement passes a literal
# username/password pair through auth=; consider reading the credentials from
# environment variables / Space secrets instead of keeping them in the source.
# NOTE(review): in the css string, `repeat 0 0;` follows the background-image
# declaration as a stand-alone token rather than a `property: value` pair —
# presumably meant for a `background` shorthand; verify the intended styling.
demo = gr.Interface(fn=predict, inputs=gr.inputs.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'), \
|
| 266 |
outputs=gr.outputs.Textbox(label='SUGGESTED VACANCIES'),\
|
| 267 |
css='div {margin-left: auto; margin-right: auto; width: 100%;\
|
| 268 |
-
background-image: url("https://drive.google.com/uc?export=view&id=1ZAvzQXQ7_xnMWfmy-UiR5zlCrnfLstoX"); repeat 0 0;}').launch(auth=("dubai777", "Pa$$w0rd123"),share=False)
|
| 269 |
-
|
|
|
|
| 4 |
from typing import Dict, Text
|
| 5 |
import numpy as np
|
| 6 |
import tensorflow as tf
|
| 7 |
+
import tensorflow_recommenders as tfrs
|
| 8 |
import os
|
| 9 |
import unidecode
|
| 10 |
from nltk import word_tokenize
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
df=pd.read_csv("/home/user/app/Dubai_translated_best_2500.csv",sep=",",header=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
df=df.drop_duplicates()
|
| 22 |
df=df.dropna()
|
|
|
|
| 23 |
df["nome_vaga"]=df["nome_vaga"].map(lambda x: x.lower().title())
|
| 24 |
df["requisito"]=df["requisito"].map(lambda x: x[0:1000].lower())
|
|
|
|
| 25 |
my_dict=dict(df.iloc[0:int(df.shape[0]*0.9),:])
|
|
|
|
| 26 |
my_dict_cego=dict(df.iloc[int(df.shape[0]*0.9):,:])
|
|
|
|
|
|
|
| 27 |
ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
|
| 28 |
"code": x["code"],
|
| 29 |
"nome_vaga": x["nome_vaga"],
|
| 30 |
"requisito": tf.strings.split(x["requisito"],maxsplit=106)
|
| 31 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
|
| 33 |
"code": x["code"],
|
| 34 |
"nome_vaga": x["nome_vaga"]
|
| 35 |
})
|
|
|
|
|
|
|
|
|
|
| 36 |
movies = movies.map(lambda x: x["code"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
|
| 38 |
"code": x["code"],
|
| 39 |
"requisito": tf.strings.split(x["requisito"],maxsplit=106)
|
| 40 |
})
|
|
|
|
| 41 |
tf.random.set_seed(42)
|
| 42 |
shuffled = ratings.shuffle(int(df.shape[0]*0.9), seed=42, reshuffle_each_iteration=False)
|
| 43 |
shuffled2 = ratings_cego.shuffle(int(df.shape[0]*0.1), seed=42, reshuffle_each_iteration=False)
|
|
|
|
| 44 |
train = shuffled.take(int(df.shape[0]*0.9))
|
| 45 |
# NOTE(review): `train` already takes int(df.shape[0]*0.9) elements from the start of
# `shuffled`, and `shuffled` is created with a fixed seed and
# reshuffle_each_iteration=False, so the elements taken here appear to be the same
# leading elements that `train` contains — i.e. this is not a disjoint hold-out set.
# The rows kept out of `ratings` live in `ratings_cego`/`cego`; confirm which split
# evaluation is supposed to use.
test = shuffled.take(int(df.shape[0]*0.1))
|
| 46 |
cego=shuffled2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
movie_titles = movies#.map(lambda x: x["code"])
|
| 48 |
user_ids = ratings.map(lambda x: x["requisito"])
|
|
|
|
| 49 |
# Materialise every tokenised "requisito" array produced by `user_ids` into a
# Python list so the arrays can be concatenated into a vocabulary afterwards.
# The previous per-item `try/except: pass` only wrapped `list.append`, which has
# nothing to recover from, and a bare `except` would also have swallowed
# KeyboardInterrupt/SystemExit; errors raised by the dataset iterator itself were
# never caught by it and still propagate here.
xx = list(user_ids.as_numpy_iterator())
|
|
|
|
|
|
|
|
|
|
| 55 |
unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
|
|
|
|
| 56 |
unique_user_ids = np.unique(np.concatenate(xx))
|
|
|
|
| 57 |
user_ids=user_ids.batch(int(df.shape[0]*0.9))
|
|
|
|
| 58 |
layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
unique_movie_titles[:10]
|
|
|
|
| 60 |
embedding_dimension = 768
|
|
|
|
| 61 |
user_model = tf.keras.Sequential([
|
| 62 |
tf.keras.layers.StringLookup(
|
| 63 |
vocabulary=unique_user_ids, mask_token=None),
|
|
|
|
| 65 |
tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
|
| 66 |
|
| 67 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
movie_model = tf.keras.Sequential([
|
| 69 |
tf.keras.layers.StringLookup(
|
| 70 |
vocabulary=unique_movie_titles, mask_token=None),
|
| 71 |
tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
|
| 72 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
metrics = tfrs.metrics.FactorizedTopK(
|
| 74 |
candidates=movies.batch(df.shape[0]
|
| 75 |
).map(movie_model)
|
| 76 |
)
|
|
|
|
| 77 |
task = tfrs.tasks.Retrieval(
|
| 78 |
metrics=metrics
|
| 79 |
)
|
|
|
|
|
|
|
| 80 |
class MovielensModel(tfrs.Model):
|
| 81 |
|
| 82 |
def __init__(self, user_model, movie_model):
|
|
|
|
| 86 |
self.task: tf.keras.layers.Layer = task
|
| 87 |
|
| 88 |
def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task loss for one batch of features.

    Args:
        features: Mapping that must provide the "requisito" and "code" entries.
        training: Accepted for interface compatibility; not read by this method.

    Returns:
        The value produced by ``self.task`` for the query/candidate embeddings.
    """
    requisito_embeddings = self.user_model(features["requisito"])
    code_embeddings = self.movie_model(features["code"])
    # Collapse axis 1 of the "requisito" embeddings by summation before scoring
    # (presumably the per-token axis — TODO confirm against the input pipeline).
    query_embeddings = tf.reduce_sum(requisito_embeddings, axis=1)
    return self.task(query_embeddings, code_embeddings)
|
| 92 |
|
| 93 |
class NoBaseClassMovielensModel(tf.keras.Model):
|
|
|
|
| 100 |
|
| 101 |
def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
|
| 102 |
|
|
|
|
| 103 |
with tf.GradientTape() as tape:
|
| 104 |
|
|
|
|
| 105 |
user_embeddings = self.user_model(features["requisito"])
|
| 106 |
positive_movie_embeddings = self.movie_model(features["code"])
|
| 107 |
loss = self.task(user_embeddings, positive_movie_embeddings)
|
| 108 |
|
|
|
|
| 109 |
regularization_loss = sum(self.losses)
|
| 110 |
|
| 111 |
total_loss = loss + regularization_loss
|
|
|
|
| 122 |
|
| 123 |
def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
|
| 124 |
|
|
|
|
| 125 |
user_embeddings = self.user_model(features["requisito"])
|
| 126 |
positive_movie_embeddings = self.movie_model(features["code"])
|
| 127 |
loss = self.task(user_embeddings, positive_movie_embeddings)
|
| 128 |
|
|
|
|
| 129 |
regularization_loss = sum(self.losses)
|
| 130 |
|
| 131 |
total_loss = loss + regularization_loss
|
|
|
|
| 140 |
model = MovielensModel(user_model, movie_model)
|
| 141 |
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
|
| 142 |
cached_train = train.shuffle(int(df.shape[0]*0.9)).batch(int(df.shape[0]*0.9)).cache()
|
|
|
|
| 143 |
cached_test = test.batch(int(df.shape[0]*0.1)).cache()
|
|
|
|
| 144 |
path = os.path.join("/home/user/app/", "model/")
|
|
|
|
|
|
|
| 145 |
cp_callback = tf.keras.callbacks.ModelCheckpoint(
|
| 146 |
filepath=path,
|
| 147 |
verbose=1,
|
| 148 |
save_weights_only=True,
|
| 149 |
save_freq=2)
|
| 150 |
|
|
|
|
| 151 |
model.fit(cached_train, callbacks=[cp_callback],epochs=200)
|
| 152 |
|
|
|
|
|
|
|
| 153 |
index=df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
|
| 154 |
|
|
|
|
|
|
|
|
|
|
| 155 |
# Unwrap the single embedding stored in each one-element list of `index`, in row
# order, so the result can be stacked into the matrix handed to the ScaNN builder.
# Iterating `index` directly covers every row: the previous hard-coded bound of
# 1633 raised IndexError when the cleaned CSV had fewer rows and silently left
# rows out of the search index when it had more. It also avoids rebuilding
# np.array(index) on every iteration.
indice = [wrapped[0] for wrapped in index]
|
| 158 |
|
|
|
|
| 159 |
searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
|
| 160 |
num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
|
| 161 |
2, quantize=True).build()
|
|
|
|
| 167 |
xx = df.iloc[neighbors[0],:].nome_vaga
|
| 168 |
return xx
|
| 169 |
|
|
|
|
|
|
|
| 170 |
# Build the Gradio interface around `predict` and launch it immediately.
# NOTE(review): the launch() call that ends this statement passes a literal
# username/password pair through auth=; consider reading the credentials from
# environment variables / Space secrets instead of keeping them in the source.
# NOTE(review): in the css string, `repeat 0 0;` follows the background-image
# declaration as a stand-alone token rather than a `property: value` pair —
# presumably meant for a `background` shorthand; verify the intended styling.
demo = gr.Interface(fn=predict, inputs=gr.inputs.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'), \
|
| 171 |
outputs=gr.outputs.Textbox(label='SUGGESTED VACANCIES'),\
|
| 172 |
css='div {margin-left: auto; margin-right: auto; width: 100%;\
|
| 173 |
+
background-image: url("https://drive.google.com/uc?export=view&id=1ZAvzQXQ7_xnMWfmy-UiR5zlCrnfLstoX"); repeat 0 0;}').launch(auth=("dubai777", "Pa$$w0rd123"),share=False)
|
|
|