visualizar-ods

Running

App Files Community

dfsandovalp01 committed on Feb 26

Commit

ce90801

verified ·

1 Parent(s): a9e96f6

Upload 3 files

Browse files

Files changed (3) hide show

src/embeddings/instructor_embeddings.py +26 -20
src/embeddings/mass_modelos_nlp_db.py +6 -1
src/embeddings/modelos_nlp_db.py +14 -4

src/embeddings/instructor_embeddings.py CHANGED Viewed

@@ -1,21 +1,27 @@
-# src/embeddings/instructor_embeddings.py
-from sentence_transformers import SentenceTransformer #
-import os
-from pathlib import Path
-class InstructorEmbeddings:
-    def __init__(self, model_name="hkunlp/instructor-large", cache_dir="./data/embeddings/cache"):
-        self.cache_dir = Path(cache_dir)
-        self.cache_dir.mkdir(parents=True, exist_ok=True)
-        # HF Spaces descargará automáticamente el modelo
-        self.model = SentenceTransformer(
-            model_name,
-            cache_folder=str(self.cache_dir)
-        )
-    def encode(self, texts, instruction="", **kwargs):
-        if instruction:
-            texts_with_instruction = [[instruction, text] for text in texts]
-            return self.model.encode(texts_with_instruction, **kwargs)
         return self.model.encode(texts, **kwargs)

+# src/embeddings/instructor_embeddings.py
+from sentence_transformers import SentenceTransformer #
+import os
+from pathlib import Path
+class InstructorEmbeddings:
+    def __init__(self, model_name="hkunlp/instructor-large", cache_dir="./data/embeddings/cache"):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        # Configurar modelo sin warning de tied weights
+        from transformers import AutoConfig
+        config = AutoConfig.from_pretrained(model_name)
+        config.tie_word_embeddings = False
+        # HF Spaces descargará automáticamente el modelo
+        self.model = SentenceTransformer(
+            model_name,
+            cache_folder=str(self.cache_dir),
+            model_kwargs={"config": config}
+        )
+    def encode(self, texts, instruction="", **kwargs):
+        if instruction:
+            texts_with_instruction = [[instruction, text] for text in texts]
+            return self.model.encode(texts_with_instruction, **kwargs)
         return self.model.encode(texts, **kwargs)

src/embeddings/mass_modelos_nlp_db.py CHANGED Viewed

@@ -133,8 +133,13 @@ def genCache(cache_name:str, tbl_input_dir:str, out_dir:str, instruction:str, ba
   # Lazy import model to allow quick --help
   from sentence_transformers import SentenceTransformer
-  model = SentenceTransformer(model_name)
   input_pairs = make_text_pairs(instruction, input_texts)
   emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
   emb_input_np = emb_input.cpu().numpy()

   # Lazy import model to allow quick --help
   from sentence_transformers import SentenceTransformer
+  from transformers import AutoConfig
+  # Cargar configuración y silenciar warning de tied weights
+  config = AutoConfig.from_pretrained(model_name)
+  config.tie_word_embeddings = False
+  model = SentenceTransformer(model_name, model_kwargs={"config": config})
   input_pairs = make_text_pairs(instruction, input_texts)
   emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
   emb_input_np = emb_input.cpu().numpy()

src/embeddings/modelos_nlp_db.py CHANGED Viewed

@@ -133,8 +133,13 @@ def genCache(cache_name:str, tbl_input_dir:str, out_dir:str, instruction:str, ba
   # Lazy import model to allow quick --help
   from sentence_transformers import SentenceTransformer
-  model = SentenceTransformer(model_name)
   input_pairs = make_text_pairs(instruction, input_texts)
   emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
   emb_input_np = emb_input.cpu().numpy()
@@ -240,11 +245,11 @@ def search(query):
   # Compute fingerprint and cache path
   # fingerprint = build_ods_fingerprint(model_name, instr_ods, ods_texts)
   # fingerprint = [build_ods_fingerprint(model_name, instr, texts[idx]) for idx, instr in enumerate(instruc_bases)]
-  fingerprint = ['e109a32969828923f9ddf6f4ad59328d','e0d3b674182b1e8ab9280544bd9e8532','07948e6beafe34049ca8a7309363eee2','9a4c52cf18e95c52566c0b657a25c44f','5a8b0dd04b865e8f1c356a64795b3b67',
                   'c0973f650cac27181b3751aa9666819b','0a475def7da8551abdd502e1d042dc00','42e4e8bfb28dc47602e662a27d8b4e76','e0338741fd4e7b08ab7f92a32e08919b']
-  ods_cache_path = cache_path or os.path.join(out_dir, f"v1_tabla_odsDescripcion_{fingerprint[0]}.npz")
   meta_cache_path = cache_path or os.path.join(out_dir, f"v1_tabla_lvlMetaOds_{fingerprint[1]}.npz")
   indicadores_cache_path = cache_path or os.path.join(out_dir, f"ods_embeddings_{fingerprint[2]}.npz")
   genero_cache_path = cache_path or os.path.join(out_dir, f"tabla_genero_{fingerprint[3]}.npz")
@@ -261,6 +266,11 @@ def search(query):
   # Lazy import model to allow quick --help
   from sentence_transformers import SentenceTransformer
   # Load / compute ODS embeddings with cache
   ods_use_cache = (not force_recompute) and os.path.exists(ods_cache_path)
@@ -298,7 +308,7 @@ def search(query):
         # emb_unfpa_np = emb_ods.cpu().numpy()
         # save_cache(cache_paths[idx], {"model_name": model_name, "instr": instruc_bases[idx], "count": len(texts[idx])}, emb_unfpa_np)
     else:
-        model = SentenceTransformer(model_name)  # still needed for project embeddings
     # Compute PATR embeddings
     patr_pairs = make_text_pairs(instruc_iniciativas[idx], patr_texts)

   # Lazy import model to allow quick --help
   from sentence_transformers import SentenceTransformer
+  from transformers import AutoConfig
+  # Cargar configuración y silenciar warning de tied weights
+  config = AutoConfig.from_pretrained(model_name)
+  config.tie_word_embeddings = False
+  model = SentenceTransformer(model_name, model_kwargs={"config": config})
   input_pairs = make_text_pairs(instruction, input_texts)
   emb_input = compute_embeddings(model, input_pairs, batch_size=batch_size, normalize=normalize)
   emb_input_np = emb_input.cpu().numpy()
   # Compute fingerprint and cache path
   # fingerprint = build_ods_fingerprint(model_name, instr_ods, ods_texts)
   # fingerprint = [build_ods_fingerprint(model_name, instr, texts[idx]) for idx, instr in enumerate(instruc_bases)]
+  fingerprint = ['53d65b93f49c3e21d40de5933bc7c1a0','e0d3b674182b1e8ab9280544bd9e8532','07948e6beafe34049ca8a7309363eee2','9a4c52cf18e95c52566c0b657a25c44f','5a8b0dd04b865e8f1c356a64795b3b67',
                   'c0973f650cac27181b3751aa9666819b','0a475def7da8551abdd502e1d042dc00','42e4e8bfb28dc47602e662a27d8b4e76','e0338741fd4e7b08ab7f92a32e08919b']
+  ods_cache_path = cache_path or os.path.join(out_dir, f"v2_tabla_lvlOds_{fingerprint[0]}.npz")
   meta_cache_path = cache_path or os.path.join(out_dir, f"v1_tabla_lvlMetaOds_{fingerprint[1]}.npz")
   indicadores_cache_path = cache_path or os.path.join(out_dir, f"ods_embeddings_{fingerprint[2]}.npz")
   genero_cache_path = cache_path or os.path.join(out_dir, f"tabla_genero_{fingerprint[3]}.npz")
   # Lazy import model to allow quick --help
   from sentence_transformers import SentenceTransformer
+  from transformers import AutoConfig
+  # Configurar para silenciar warning de tied weights
+  config = AutoConfig.from_pretrained(model_name)
+  config.tie_word_embeddings = False
   # Load / compute ODS embeddings with cache
   ods_use_cache = (not force_recompute) and os.path.exists(ods_cache_path)
         # emb_unfpa_np = emb_ods.cpu().numpy()
         # save_cache(cache_paths[idx], {"model_name": model_name, "instr": instruc_bases[idx], "count": len(texts[idx])}, emb_unfpa_np)
     else:
+        model = SentenceTransformer(model_name, model_kwargs={"config": config})  # still needed for project embeddings
     # Compute PATR embeddings
     patr_pairs = make_text_pairs(instruc_iniciativas[idx], patr_texts)