| import hashlib | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
| import logging | |
| from ..config import get_settings | |
# Prefix that embed_texts() prepends to each text when its `instruction`
# flag is set.
INSTRUCTION = "Represent this candidate profile for matching job descriptions: "

# Cached encoder instance; remains None until _get_model() first builds it.
_model: SentenceTransformer | None = None

# Sets TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably read by the Hugging Face tokenizers library to
# switch off its internal parallelism / fork warnings -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Let only ERROR-and-above records through from this transformers logger.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
def _get_model() -> SentenceTransformer:
    """Return the module-level SentenceTransformer, building it on first use.

    On the first call the model name is read from
    ``get_settings().embedding_model`` and the resulting instance is stored
    in the module global ``_model``; later calls return that same instance.
    """
    global _model
    if _model is not None:
        return _model
    model_name = get_settings().embedding_model
    _model = SentenceTransformer(model_name)
    return _model
def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
    """Encode a batch of texts into normalised float32 embeddings.

    Args:
        texts: Texts to embed.
        instruction: When true, ``INSTRUCTION`` is prepended to every text
            before it is encoded.

    Returns:
        A float32 array produced by ``model.encode`` with
        ``normalize_embeddings=True`` (expected to hold one row per input
        text). For an empty ``texts`` the result is an empty array of shape
        ``(0, dim)``, where ``dim`` is the model's reported embedding
        dimension (0 if the model does not report one).
    """
    model = _get_model()

    if len(texts) == 0:
        # Build the empty result explicitly instead of handing an empty batch
        # to encode(), so callers always receive a 2-D matrix they can stack
        # or multiply without special-casing.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    if instruction:
        texts = [INSTRUCTION + t for t in texts]

    embeddings = model.encode(
        texts,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False,
    )
    return embeddings.astype(np.float32)
def embed_query(text: str) -> np.ndarray:
    """Embed a single job-description query as a float32 vector.

    The text is prefixed with a retrieval instruction, encoded as a
    one-element batch with ``normalize_embeddings=True``, and the first
    (only) result is returned cast to float32.
    """
    encoder = _get_model()
    prefix = "Represent this job description for retrieving matching candidates: "
    prompt = prefix + text
    batch = encoder.encode(
        [prompt],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    first = batch[0]
    return first.astype(np.float32)
def compute_text_hash(text: str) -> str:
    """Return a 32-character hexadecimal fingerprint of *text*.

    The fingerprint is the first 32 hex digits (128 bits) of the SHA-256
    digest of the UTF-8 encoded text.

    Lone surrogate code points (e.g. from a truncated ``\\ud83d`` JSON
    escape) are encoded with the ``surrogatepass`` handler instead of
    raising ``UnicodeEncodeError``, so every ``str`` has a hash. Hashes of
    well-formed text are unaffected by this.
    """
    data = text.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(data).hexdigest()[:32]