NurseCitizenDeveloper committed on
Commit
ca709ec
·
1 Parent(s): 5a9ea42

feat: precompute semantic embeddings to drastically reduce startup time and prevent Gradio timeouts

Browse files
build_embeddings.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import logging
5
+ from sentence_transformers import SentenceTransformer
6
+
7
# Basic console logging so encoding progress is visible when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fine-tuned MiniLM sentence-embedding model; must stay identical to
# MODEL_NAME in local_search.py so the precomputed vectors are comparable
# to query vectors encoded at search time.
MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
# JSON cache of legislation sections, stored next to this file.
CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
# Output path for the precomputed embeddings tensor (written via torch.save,
# loaded by local_search.py at startup instead of re-encoding).
EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections_embeddings.pt")
14
def build() -> None:
    """Precompute sentence embeddings for all cached legislation sections.

    Loads the fine-tuned SentenceTransformer model, reads the section cache
    from ``CACHE_FILE``, encodes one combined text per section, and saves the
    resulting tensor to ``EMBEDDINGS_FILE`` so the app can load embeddings
    instantly at startup instead of re-encoding on every launch.

    Raises:
        FileNotFoundError: if ``CACHE_FILE`` does not exist.
        json.JSONDecodeError: if the cache file is not valid JSON.
    """
    logger.info("Loading model...")
    model = SentenceTransformer(MODEL_NAME)

    logger.info("Loading sections...")
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        sections = json.load(f)

    # Guard the empty case: saving an empty tensor would later be loaded by
    # the search code and mismatch its (also empty or stale) section list.
    if not sections:
        logger.warning("No sections found in %s; nothing to encode.", CACHE_FILE)
        return

    # One text per section: act name + section number/title + body text.
    # This format must match the corpus text built in local_search.py so the
    # cached embeddings are interchangeable with freshly computed ones.
    corpus_texts = [_section_text(s) for s in sections]

    logger.info("Encoding %d sections...", len(corpus_texts))
    embeddings = model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=True)

    logger.info("Saving embeddings to file...")
    torch.save(embeddings, EMBEDDINGS_FILE)
    logger.info("Done!")


def _section_text(section: dict) -> str:
    """Build the single text string that gets embedded for one section dict.

    ``section`` is one entry from the JSON cache; missing keys fall back to
    empty strings so a partially populated record still encodes.
    """
    leg_id = section.get("legislation_id", "")
    # legislation_id appears to be a path-like URI; the second-to-last
    # component is treated as the act name — TODO confirm against the cache.
    act_name = leg_id.split("/")[-2] if "/" in leg_id else leg_id
    return (
        f"Act: {act_name}. Section {section.get('number', '')}: "
        f"{section.get('title', '')}. {section.get('text', '')}"
    )


if __name__ == "__main__":
    build()
local_search.py CHANGED
@@ -15,6 +15,7 @@ logger = logging.getLogger(__name__)
15
  # Constants
16
  MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
17
  CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
 
18
 
19
  # Global variables to hold the model and embeddings in memory
20
  _model = None
@@ -43,6 +44,12 @@ def init_local_search():
43
  logger.warning("No sections found in cache.")
44
  return
45
 
 
 
 
 
 
 
46
  logger.info(f"Computing embeddings for {len(_sections)} cached sections. This may take a minute on first run...")
47
  # Prepare text for embedding: combine legislation title, section title, and text
48
  corpus_texts = []
@@ -57,6 +64,12 @@ def init_local_search():
57
 
58
  # Encode all sections
59
  _corpus_embeddings = _model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=False)
 
 
 
 
 
 
60
  logger.info("Local semantic search engine ready.")
61
 
62
  except Exception as e:
 
15
  # Constants
16
  MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
17
  CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
18
+ EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections_embeddings.pt")
19
 
20
  # Global variables to hold the model and embeddings in memory
21
  _model = None
 
44
  logger.warning("No sections found in cache.")
45
  return
46
 
47
+ if os.path.exists(EMBEDDINGS_FILE):
48
+ logger.info("Loading precomputed embeddings from disk (Instant)...")
49
+ _corpus_embeddings = torch.load(EMBEDDINGS_FILE)
50
+ logger.info("Local semantic search engine ready.")
51
+ return
52
+
53
  logger.info(f"Computing embeddings for {len(_sections)} cached sections. This may take a minute on first run...")
54
  # Prepare text for embedding: combine legislation title, section title, and text
55
  corpus_texts = []
 
64
 
65
  # Encode all sections
66
  _corpus_embeddings = _model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=False)
67
+ logger.info("Saving computed embeddings for future use...")
68
+ try:
69
+ torch.save(_corpus_embeddings, EMBEDDINGS_FILE)
70
+ except Exception as save_err:
71
+ logger.warning(f"Failed to save embeddings cache: {save_err}")
72
+
73
  logger.info("Local semantic search engine ready.")
74
 
75
  except Exception as e:
nursing_sections_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6b5871be990e132910869cb92483eed48fad8390fe7e81956319db00ffbbe86
3
+ size 1734389