dfsandovalp01 committed on
Commit
ce90801
·
verified ·
1 Parent(s): a9e96f6

Upload 3 files

Browse files
src/embeddings/instructor_embeddings.py CHANGED
@@ -1,21 +1,27 @@
1
- # src/embeddings/instructor_embeddings.py
2
- from sentence_transformers import SentenceTransformer #
3
- import os
4
- from pathlib import Path
5
-
6
- class InstructorEmbeddings:
7
- def __init__(self, model_name="hkunlp/instructor-large", cache_dir="./data/embeddings/cache"):
8
- self.cache_dir = Path(cache_dir)
9
- self.cache_dir.mkdir(parents=True, exist_ok=True)
10
-
11
- # HF Spaces descargará automáticamente el modelo
12
- self.model = SentenceTransformer(
13
- model_name,
14
- cache_folder=str(self.cache_dir)
15
- )
16
-
17
- def encode(self, texts, instruction="", **kwargs):
18
- if instruction:
19
- texts_with_instruction = [[instruction, text] for text in texts]
20
- return self.model.encode(texts_with_instruction, **kwargs)
 
 
 
 
 
 
21
  return self.model.encode(texts, **kwargs)
 
1
+ # src/embeddings/instructor_embeddings.py
2
+ from sentence_transformers import SentenceTransformer #
3
+ import os
4
+ from pathlib import Path
5
+
6
class InstructorEmbeddings:
    """Wrapper around a SentenceTransformer model with optional instruction prefixes.

    The model files are cached under ``cache_dir``; the directory is created
    on construction if it does not exist yet.
    """

    def __init__(self, model_name="hkunlp/instructor-large", cache_dir="./data/embeddings/cache"):
        """Create the cache directory and load the embedding model.

        Args:
            model_name: Model identifier or local path handed to SentenceTransformer.
            cache_dir: Directory used as the SentenceTransformer cache folder.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Disable tied word embeddings (silences the tied-weights warning) via
        # config_kwargs. SentenceTransformer builds the model config itself and
        # forwards it to the underlying loader, so putting a ready-made
        # ``config`` object inside model_kwargs would supply that argument
        # twice. config_kwargs applies the override to the config it builds and
        # keeps the download inside ``cache_folder``.
        # The model is downloaded automatically on first use (e.g. on HF Spaces).
        self.model = SentenceTransformer(
            model_name,
            cache_folder=str(self.cache_dir),
            config_kwargs={"tie_word_embeddings": False},
        )

    def encode(self, texts, instruction="", **kwargs):
        """Encode texts, optionally pairing each one with an instruction.

        Args:
            texts: A single string or an iterable of strings.
            instruction: When non-empty, every text is sent to the model as an
                ``[instruction, text]`` pair; when empty, texts are passed
                through unchanged.
            **kwargs: Forwarded verbatim to ``self.model.encode``.

        Returns:
            Whatever ``self.model.encode`` returns. For a single string with an
            instruction, the first (only) element of the batch result.
        """
        if not instruction:
            return self.model.encode(texts, **kwargs)

        # A bare string must be treated as one text; iterating it would pair
        # the instruction with each individual character.
        if isinstance(texts, str):
            return self.model.encode([[instruction, texts]], **kwargs)[0]

        texts_with_instruction = [[instruction, text] for text in texts]
        return self.model.encode(texts_with_instruction, **kwargs)
src/embeddings/mass_modelos_nlp_db.py CHANGED
@@ -133,8 +133,13 @@ def genCache(cache_name:str, tbl_input_dir:str, out_dir:str, instruction:str, ba
133
 
134
  # Lazy import model to allow quick --help
135
  from sentence_transformers import SentenceTransformer
 
136
 
137
- model = SentenceTransformer(model_name)
 
 
 
 
138
  input_pairs = make_text_pairs(instruction, input_texts)
139
  emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
140
  emb_input_np = emb_input.cpu().numpy()
 
133
 
134
  # Lazy import model to allow quick --help
135
  from sentence_transformers import SentenceTransformer
136
+ from transformers import AutoConfig
137
 
138
+ # Cargar configuración y silenciar warning de tied weights
139
+ config = AutoConfig.from_pretrained(model_name)
140
+ config.tie_word_embeddings = False
141
+
142
+ model = SentenceTransformer(model_name, model_kwargs={"config": config})
143
  input_pairs = make_text_pairs(instruction, input_texts)
144
  emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
145
  emb_input_np = emb_input.cpu().numpy()
src/embeddings/modelos_nlp_db.py CHANGED
@@ -133,8 +133,13 @@ def genCache(cache_name:str, tbl_input_dir:str, out_dir:str, instruction:str, ba
133
 
134
  # Lazy import model to allow quick --help
135
  from sentence_transformers import SentenceTransformer
 
136
 
137
- model = SentenceTransformer(model_name)
 
 
 
 
138
  input_pairs = make_text_pairs(instruction, input_texts)
139
  emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
140
  emb_input_np = emb_input.cpu().numpy()
@@ -240,11 +245,11 @@ def search(query):
240
  # Compute fingerprint and cache path
241
  # fingerprint = build_ods_fingerprint(model_name, instr_ods, ods_texts)
242
  # fingerprint = [build_ods_fingerprint(model_name, instr, texts[idx]) for idx, instr in enumerate(instruc_bases)]
243
- fingerprint = ['e109a32969828923f9ddf6f4ad59328d','e0d3b674182b1e8ab9280544bd9e8532','07948e6beafe34049ca8a7309363eee2','9a4c52cf18e95c52566c0b657a25c44f','5a8b0dd04b865e8f1c356a64795b3b67',
244
  'c0973f650cac27181b3751aa9666819b','0a475def7da8551abdd502e1d042dc00','42e4e8bfb28dc47602e662a27d8b4e76','e0338741fd4e7b08ab7f92a32e08919b']
245
 
246
 
247
- ods_cache_path = cache_path or os.path.join(out_dir, f"v1_tabla_odsDescripcion_{fingerprint[0]}.npz")
248
  meta_cache_path = cache_path or os.path.join(out_dir, f"v1_tabla_lvlMetaOds_{fingerprint[1]}.npz")
249
  indicadores_cache_path = cache_path or os.path.join(out_dir, f"ods_embeddings_{fingerprint[2]}.npz")
250
  genero_cache_path = cache_path or os.path.join(out_dir, f"tabla_genero_{fingerprint[3]}.npz")
@@ -261,6 +266,11 @@ def search(query):
261
 
262
  # Lazy import model to allow quick --help
263
  from sentence_transformers import SentenceTransformer
 
 
 
 
 
264
 
265
  # Load / compute ODS embeddings with cache
266
  ods_use_cache = (not force_recompute) and os.path.exists(ods_cache_path)
@@ -298,7 +308,7 @@ def search(query):
298
  # emb_unfpa_np = emb_ods.cpu().numpy()
299
  # save_cache(cache_paths[idx], {"model_name": model_name, "instr": instruc_bases[idx], "count": len(texts[idx])}, emb_unfpa_np)
300
  else:
301
- model = SentenceTransformer(model_name) # still needed for project embeddings
302
 
303
  # Compute PATR embeddings
304
  patr_pairs = make_text_pairs(instruc_iniciativas[idx], patr_texts)
 
133
 
134
  # Lazy import model to allow quick --help
135
  from sentence_transformers import SentenceTransformer
136
+ from transformers import AutoConfig
137
 
138
+ # Cargar configuración y silenciar warning de tied weights
139
+ config = AutoConfig.from_pretrained(model_name)
140
+ config.tie_word_embeddings = False
141
+
142
+ model = SentenceTransformer(model_name, model_kwargs={"config": config})
143
  input_pairs = make_text_pairs(instruction, input_texts)
144
  emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
145
  emb_input_np = emb_input.cpu().numpy()
 
245
  # Compute fingerprint and cache path
246
  # fingerprint = build_ods_fingerprint(model_name, instr_ods, ods_texts)
247
  # fingerprint = [build_ods_fingerprint(model_name, instr, texts[idx]) for idx, instr in enumerate(instruc_bases)]
248
+ fingerprint = ['53d65b93f49c3e21d40de5933bc7c1a0','e0d3b674182b1e8ab9280544bd9e8532','07948e6beafe34049ca8a7309363eee2','9a4c52cf18e95c52566c0b657a25c44f','5a8b0dd04b865e8f1c356a64795b3b67',
249
  'c0973f650cac27181b3751aa9666819b','0a475def7da8551abdd502e1d042dc00','42e4e8bfb28dc47602e662a27d8b4e76','e0338741fd4e7b08ab7f92a32e08919b']
250
 
251
 
252
+ ods_cache_path = cache_path or os.path.join(out_dir, f"v2_tabla_lvlOds_{fingerprint[0]}.npz")
253
  meta_cache_path = cache_path or os.path.join(out_dir, f"v1_tabla_lvlMetaOds_{fingerprint[1]}.npz")
254
  indicadores_cache_path = cache_path or os.path.join(out_dir, f"ods_embeddings_{fingerprint[2]}.npz")
255
  genero_cache_path = cache_path or os.path.join(out_dir, f"tabla_genero_{fingerprint[3]}.npz")
 
266
 
267
  # Lazy import model to allow quick --help
268
  from sentence_transformers import SentenceTransformer
269
+ from transformers import AutoConfig
270
+
271
+ # Configurar para silenciar warning de tied weights
272
+ config = AutoConfig.from_pretrained(model_name)
273
+ config.tie_word_embeddings = False
274
 
275
  # Load / compute ODS embeddings with cache
276
  ods_use_cache = (not force_recompute) and os.path.exists(ods_cache_path)
 
308
  # emb_unfpa_np = emb_ods.cpu().numpy()
309
  # save_cache(cache_paths[idx], {"model_name": model_name, "instr": instruc_bases[idx], "count": len(texts[idx])}, emb_unfpa_np)
310
  else:
311
+ model = SentenceTransformer(model_name, model_kwargs={"config": config}) # still needed for project embeddings
312
 
313
  # Compute PATR embeddings
314
  patr_pairs = make_text_pairs(instruc_iniciativas[idx], patr_texts)