"""Sentence-embedding helpers for matching candidate profiles to job descriptions.

A single ``SentenceTransformer`` instance is created lazily on first use and
shared by every helper in this module.
"""

import hashlib
import logging
import os

import numpy as np
from sentence_transformers import SentenceTransformer

from ..config import get_settings

# Turn off the Hugging Face tokenizers' own parallelism and hide
# below-ERROR messages from the tokenizer base logger.
# NOTE(review): presumably this avoids fork-related warnings -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# Process-wide model cache; populated by _get_model() on first call.
_model: SentenceTransformer | None = None

# Prompt prefixes prepended to the raw text before encoding.
INSTRUCTION = "Represent this candidate profile for matching job descriptions: "
QUERY_INSTRUCTION = (
    "Represent this job description for retrieving matching candidates: "
)


def _get_model() -> SentenceTransformer:
    """Return the shared embedding model, loading it on first use.

    The model name is read from ``get_settings().embedding_model``.
    """
    global _model
    if _model is None:
        settings = get_settings()
        _model = SentenceTransformer(settings.embedding_model)
    return _model


def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
    """Embed a batch of texts as normalized float32 vectors.

    Args:
        texts: Texts to embed.
        instruction: When true, prefix every text with ``INSTRUCTION``
            before encoding; when false, ``texts`` is encoded as given.

    Returns:
        A float32 array with one embedding per input text. An empty list
        or tuple yields an array of shape ``(0, dim)``.
    """
    model = _get_model()

    # Encoding an empty batch can produce a 1-D array with no embedding
    # axis, which breaks callers that stack or index by column. Only the
    # unambiguous empty list/tuple case is short-circuited so any other
    # input type keeps its previous behavior.
    if isinstance(texts, (list, tuple)) and not texts:
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    inputs = [INSTRUCTION + t for t in texts] if instruction else texts
    embeddings = model.encode(
        inputs,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False,
    )
    return embeddings.astype(np.float32)


def embed_query(text: str) -> np.ndarray:
    """Embed one job-description query as a normalized float32 vector.

    Args:
        text: The query text; ``QUERY_INSTRUCTION`` is prepended to it.

    Returns:
        The first (and only) row of the encoded batch, cast to float32.
    """
    model = _get_model()
    query = QUERY_INSTRUCTION + text
    encoded = model.encode(
        [query],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return encoded[0].astype(np.float32)


def compute_text_hash(text: str) -> str:
    """Return the first 32 hex characters of the SHA-256 digest of ``text``.

    The text is encoded as UTF-8 before hashing.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return digest[:32]