Rubens committed on
Commit 2b80e27 · 1 Parent(s): 50a738c
Files changed (1)
  1. app.py +2 -98
app.py CHANGED
@@ -4,7 +4,7 @@ import tempfile
  from typing import Dict, Text
  import numpy as np
  import tensorflow as tf
- import tensorflow_recommenders as tfrs #scann 1.2.7 + recomm 0.7.0 + TF 2.8.0
+ import tensorflow_recommenders as tfrs
  import os
  import unidecode
  from nltk import word_tokenize
@@ -18,105 +18,46 @@ import scann


  df=pd.read_csv("/home/user/app/Dubai_translated_best_2500.csv",sep=",",header=0)
-
- for i in range(0,len(df['requisito'])):
-     print(len(df['requisito'].iloc[i]))
-
  df=df.drop_duplicates()
  df=df.dropna()
-
  df["nome_vaga"]=df["nome_vaga"].map(lambda x: x.lower().title())
  df["requisito"]=df["requisito"].map(lambda x: x[0:1000].lower())
-
  my_dict=dict(df.iloc[0:int(df.shape[0]*0.9),:])
-
  my_dict_cego=dict(df.iloc[int(df.shape[0]*0.9):,:])
-
-
  ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
      "code": x["code"],
      "nome_vaga": x["nome_vaga"],
      "requisito": tf.strings.split(x["requisito"],maxsplit=106)
  })
-
- l=[]
- for x in ratings.as_numpy_iterator():
-     pprint.pprint(len(x['requisito']))
-     l.append(len(x['requisito']))
-
- min(l)
-
-
  movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
      "code": x["code"],
      "nome_vaga": x["nome_vaga"]
  })
- for x in movies.take(1).as_numpy_iterator():
-     pprint.pprint(x)
-
  movies = movies.map(lambda x: x["code"])
-
-
- for x in ratings.take(5).as_numpy_iterator():
-     pprint.pprint(x)
-
-
- for x in movies.take(5).as_numpy_iterator():
-     pprint.pprint(x)
-
  ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
      "code": x["code"],
      "requisito": tf.strings.split(x["requisito"],maxsplit=106)
  })
-
  tf.random.set_seed(42)
  shuffled = ratings.shuffle(int(df.shape[0]*0.9), seed=42, reshuffle_each_iteration=False)
  shuffled2 = ratings_cego.shuffle(int(df.shape[0]*0.1), seed=42, reshuffle_each_iteration=False)
-
  train = shuffled.take(int(df.shape[0]*0.9))
  test = shuffled.take(int(df.shape[0]*0.1))
  cego=shuffled2
-
- for x in train.take(1).as_numpy_iterator():
-     pprint.pprint(x)
-
- for x in test.take(5).as_numpy_iterator():
-     pprint.pprint(x)
-
-
-
-
  movie_titles = movies#.map(lambda x: x["code"])
  user_ids = ratings.map(lambda x: x["requisito"])
-
  xx=[]
  for x in user_ids.as_numpy_iterator():
      try:
-         #print(x)
          xx.append(x)
      except:
          pass
-
-
-
  unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
-
  unique_user_ids = np.unique(np.concatenate(xx))
-
  user_ids=user_ids.batch(int(df.shape[0]*0.9))
-
  layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)
-
- for x in ratings.take(1).as_numpy_iterator():
-     pprint.pprint(x['requisito'])
-
- for x in ratings.take(5).as_numpy_iterator():
-     pprint.pprint(np.array(layer(x['requisito'])))
-
  unique_movie_titles[:10]
-
  embedding_dimension = 768
-
  user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None),
@@ -124,31 +65,18 @@ user_model = tf.keras.Sequential([
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),

  ])
-
- for x in train.take(5).as_numpy_iterator():
-     pprint.pprint(np.array(user_model(x['requisito'])).shape)
-
-
  movie_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
  ])
-
- for x in train.take(5).as_numpy_iterator():
-     pprint.pprint(np.array(movie_model(x['code'])).shape)
-
-
  metrics = tfrs.metrics.FactorizedTopK(
      candidates=movies.batch(df.shape[0]
  ).map(movie_model)
  )
-
  task = tfrs.tasks.Retrieval(
      metrics=metrics
  )
-
-
  class MovielensModel(tfrs.Model):

      def __init__(self, user_model, movie_model):
@@ -158,13 +86,8 @@ class MovielensModel(tfrs.Model):
          self.task: tf.keras.layers.Layer = task

      def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
-         # We pick out the user features and pass them into the user model.
          user_embeddings = self.user_model(features["requisito"])
-         # And pick out the movie features and pass them into the movie model,
-         # getting embeddings back.
          positive_movie_embeddings = self.movie_model(features["code"])
-
-         # The task computes the loss and the metrics.
          return self.task(tf.reduce_sum(user_embeddings,axis=1), positive_movie_embeddings)

  class NoBaseClassMovielensModel(tf.keras.Model):
@@ -177,15 +100,12 @@ class NoBaseClassMovielensModel(tf.keras.Model):

      def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

-         # Set up a gradient tape to record gradients.
          with tf.GradientTape() as tape:

-             # Loss computation.
              user_embeddings = self.user_model(features["requisito"])
              positive_movie_embeddings = self.movie_model(features["code"])
              loss = self.task(user_embeddings, positive_movie_embeddings)

-             # Handle regularization losses as well.
              regularization_loss = sum(self.losses)

              total_loss = loss + regularization_loss
@@ -202,12 +122,10 @@ class NoBaseClassMovielensModel(tf.keras.Model):

      def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

-         # Loss computation.
          user_embeddings = self.user_model(features["requisito"])
          positive_movie_embeddings = self.movie_model(features["code"])
          loss = self.task(user_embeddings, positive_movie_embeddings)

-         # Handle regularization losses as well.
          regularization_loss = sum(self.losses)

          total_loss = loss + regularization_loss
@@ -222,33 +140,22 @@ class NoBaseClassMovielensModel(tf.keras.Model):
  model = MovielensModel(user_model, movie_model)
  model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
  cached_train = train.shuffle(int(df.shape[0]*0.9)).batch(int(df.shape[0]*0.9)).cache()
-
  cached_test = test.batch(int(df.shape[0]*0.1)).cache()
-
  path = os.path.join("/home/user/app/", "model/")
-
-
  cp_callback = tf.keras.callbacks.ModelCheckpoint(
      filepath=path,
      verbose=1,
      save_weights_only=True,
      save_freq=2)

-
  model.fit(cached_train, callbacks=[cp_callback],epochs=200)

-
-
  index=df["code"].map(lambda x: [model.movie_model(tf.constant(x))])

-
- from sklearn.metrics.pairwise import cosine_similarity
-
  indice=[]
  for i in range(0,1633):
      indice.append(np.array(index)[i][0])

-
  searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
      num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
      2, quantize=True).build()
@@ -260,10 +167,7 @@ def predict(text):
      xx = df.iloc[neighbors[0],:].nome_vaga
      return xx

-
-
  demo = gr.Interface(fn=predict, inputs=gr.inputs.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'), \
  outputs=gr.outputs.Textbox(label='SUGGESTED VACANCIES'),\
  css='div {margin-left: auto; margin-right: auto; width: 100%;\
- background-image: url("https://drive.google.com/uc?export=view&id=1ZAvzQXQ7_xnMWfmy-UiR5zlCrnfLstoX"); repeat 0 0;}').launch(auth=("dubai777", "Pa$$w0rd123"),share=False)
-
+ background-image: url("https://drive.google.com/uc?export=view&id=1ZAvzQXQ7_xnMWfmy-UiR5zlCrnfLstoX"); repeat 0 0;}').launch(auth=("dubai777", "Pa$$w0rd123"),share=False)
 