"""Encode legislation sections with a sentence-transformer and save the vectors.

Reads the section records from ``CACHE_FILE``, renders each one to a single
text string, encodes all strings with ``MODEL_NAME`` and writes the resulting
NumPy array to ``EMBEDDINGS_FILE``.
"""

import json
import logging
import os

import numpy as np
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
# Both data files live in the same directory as this module.
CACHE_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections.json")
EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "nursing_sections_embeddings.npy")


def _section_text(section: dict) -> str:
    """Render one section record as the text string that gets embedded.

    Missing keys fall back to an empty string.  The act name is the
    second-to-last ``/``-separated segment of ``legislation_id``; when the id
    contains no ``/`` the whole id is used as-is.
    """
    leg_id = section.get("legislation_id", "")
    # A "/" in the id guarantees at least two segments, so [-2] is safe here.
    act_name = leg_id.split("/")[-2] if "/" in leg_id else leg_id
    return (
        f"Act: {act_name}. Section {section.get('number', '')}: "
        f"{section.get('title', '')}. {section.get('text', '')}"
    )


def build() -> None:
    """Build the embeddings file from the cached sections JSON.

    Loads the model named by ``MODEL_NAME``, reads ``CACHE_FILE`` (iterated as
    a sequence of dict-like section records -- presumably a JSON array of
    objects; confirm against the producer of that file), encodes one text per
    section and saves the array to ``EMBEDDINGS_FILE`` with ``np.save``.

    Raises:
        OSError: if ``CACHE_FILE`` cannot be opened.
        json.JSONDecodeError: if ``CACHE_FILE`` is not valid JSON.
    """
    logger.info("Loading model...")
    model = SentenceTransformer(MODEL_NAME)

    logger.info("Loading sections...")
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        sections = json.load(f)

    corpus_texts = [_section_text(section) for section in sections]

    logger.info("Encoding %d sections...", len(corpus_texts))
    # Get numpy arrays instead of tensors
    embeddings = model.encode(corpus_texts, convert_to_numpy=True, show_progress_bar=True)

    logger.info("Saving numpy embeddings to file...")
    np.save(EMBEDDINGS_FILE, embeddings)
    logger.info("Done!")


if __name__ == "__main__":
    build()