Spaces:

NurseCitizenDeveloper
/

NurseLex

Sleeping

NurseCitizenDeveloper committed on 12 days ago

Commit

ca709ec

1 Parent(s): 5a9ea42

feat: precompute semantic embeddings to drastically reduce startup time and prevent Gradio timeouts

Files changed (3) hide show

build_embeddings.py ADDED Viewed

+import os
+import json
+import torch
+import logging
+from sentence_transformers import SentenceTransformer
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
+CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
+EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections_embeddings.pt")
+# Precompute sentence embeddings for every section in CACHE_FILE and write
+# them to EMBEDDINGS_FILE with torch.save. Takes no arguments and returns
+# nothing; progress is reported through the module logger.
+def build():
+    logger.info("Loading model...")
+    model = SentenceTransformer(MODEL_NAME)
+    logger.info("Loading sections...")
+    with open(CACHE_FILE, "r", encoding="utf-8") as f:
+        sections = json.load(f)
+    # One text per section, in the same order as the JSON entries.
+    # NOTE(review): presumably this format must match the corpus text built
+    # in local_search.init_local_search -- confirm against that file.
+    corpus_texts = []
+    for s in sections:
+        leg_id = s.get("legislation_id", "")
+        # Use the second-to-last "/"-separated component of the id as the
+        # act name; ids without a "/" are used unchanged.
+        act_name = leg_id.split("/")[-2] if "/" in leg_id else leg_id
+        # Missing "number", "title" or "text" keys fall back to "".
+        content = f"Act: {act_name}. Section {s.get('number', '')}: {s.get('title', '')}. {s.get('text', '')}"
+        corpus_texts.append(content)
+    logger.info(f"Encoding {len(corpus_texts)} sections...")
+    # convert_to_tensor=True requests a tensor result, which is what gets
+    # handed to torch.save below.
+    # NOTE(review): the saved tensor presumably lives on whatever device the
+    # model encoded on -- confirm it loads on the deployment machine.
+    embeddings = model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=True)
+    logger.info("Saving embeddings to file...")
+    torch.save(embeddings, EMBEDDINGS_FILE)
+    logger.info("Done!")
+if __name__ == "__main__":
+    build()

local_search.py CHANGED Viewed

@@ -15,6 +15,7 @@ logger = logging.getLogger(__name__)
 # Constants
 MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
 CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
 # Global variables to hold the model and embeddings in memory
 _model = None
@@ -43,6 +44,12 @@ def init_local_search():
             logger.warning("No sections found in cache.")
             return
         logger.info(f"Computing embeddings for {len(_sections)} cached sections. This may take a minute on first run...")
         # Prepare text for embedding: combine legislation title, section title, and text
         corpus_texts = []
@@ -57,6 +64,12 @@ def init_local_search():
         # Encode all sections
         _corpus_embeddings = _model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=False)
         logger.info("Local semantic search engine ready.")
     except Exception as e:

 # Constants
 MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
 CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
+EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections_embeddings.pt")
 # Global variables to hold the model and embeddings in memory
 _model = None
             logger.warning("No sections found in cache.")
             return
+        if os.path.exists(EMBEDDINGS_FILE):
+            logger.info("Loading precomputed embeddings from disk (Instant)...")
+            _corpus_embeddings = torch.load(EMBEDDINGS_FILE)
+            logger.info("Local semantic search engine ready.")
+            return
         logger.info(f"Computing embeddings for {len(_sections)} cached sections. This may take a minute on first run...")
         # Prepare text for embedding: combine legislation title, section title, and text
         corpus_texts = []
         # Encode all sections
         _corpus_embeddings = _model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=False)
+        logger.info("Saving computed embeddings for future use...")
+        try:
+            torch.save(_corpus_embeddings, EMBEDDINGS_FILE)
+        except Exception as save_err:
+            logger.warning(f"Failed to save embeddings cache: {save_err}")
         logger.info("Local semantic search engine ready.")
     except Exception as e:

nursing_sections_embeddings.pt ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6b5871be990e132910869cb92483eed48fad8390fe7e81956319db00ffbbe86
+size 1734389