Spaces:

NurseCitizenDeveloper
/

NurseLex

Sleeping

NurseLex / build_embeddings.py

fix(ui): remove experimental webgpu tab and switch semantic cache back to stable numpy arrays

0e0459c 11 days ago

1.33 kB

	import os
	import json
	import numpy as np
	import logging
	from sentence_transformers import SentenceTransformer

	# Emit INFO-level progress messages when the script runs.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Directory containing this script; the input and output files live beside it.
	_HERE = os.path.dirname(__file__)

	# Sentence-embedding model identifier passed to SentenceTransformer.
	MODEL_NAME = "i-dot-ai/all-miniLM-L6-v2-UKPGA-6k-finetune"
	# JSON file holding the sections to embed (input).
	CACHE_FILE = os.path.join(_HERE, "nursing_sections.json")
	# Destination .npy file for the computed embeddings (output).
	EMBEDDINGS_FILE = os.path.join(_HERE, "nursing_sections_embeddings.npy")

	def _section_to_text(section):
	    """Render one section record as the single string that gets embedded.

	    The "act" label is the second-to-last "/"-separated component of the
	    record's ``legislation_id`` when it contains a slash, otherwise the
	    whole id. A missing or null ``legislation_id`` is treated as an empty
	    string so one incomplete record cannot abort the whole run.
	    """
	    leg_id = section.get("legislation_id") or ""
	    act_name = leg_id.split("/")[-2] if "/" in leg_id else leg_id
	    return (
	        f"Act: {act_name}. "
	        f"Section {section.get('number', '')}: {section.get('title', '')}. "
	        f"{section.get('text', '')}"
	    )


	def build():
	    """Encode every section in CACHE_FILE and save the result to EMBEDDINGS_FILE.

	    Loads the model named by MODEL_NAME, reads the sections JSON (UTF-8),
	    builds one text per section, encodes all texts in a single call and
	    writes the resulting numpy array with ``np.save``.

	    Assumes the JSON root is a list of objects (each is read with ``.get``)
	    -- TODO confirm against the file that produces CACHE_FILE.

	    Raises:
	        OSError: if CACHE_FILE cannot be opened (e.g. FileNotFoundError).
	        json.JSONDecodeError: if CACHE_FILE is not valid JSON.
	    """
	    logger.info("Loading model...")
	    model = SentenceTransformer(MODEL_NAME)

	    logger.info("Loading sections...")
	    with open(CACHE_FILE, "r", encoding="utf-8") as f:
	        sections = json.load(f)

	    # One text per section, in file order, so the embeddings line up with
	    # the sections by index.
	    corpus_texts = [_section_to_text(s) for s in sections]

	    logger.info("Encoding %d sections...", len(corpus_texts))
	    # Get numpy arrays instead of tensors
	    embeddings = model.encode(corpus_texts, convert_to_numpy=True, show_progress_bar=True)

	    logger.info("Saving numpy embeddings to file...")
	    np.save(EMBEDDINGS_FILE, embeddings)
	    logger.info("Done!")

	# Build the embeddings only when run as a script, not when imported.
	if __name__ == "__main__":
	    build()