File size: 1,296 Bytes
bdaeeeb b7d367d bdaeeeb b7d367d bdaeeeb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | import hashlib
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import logging
from ..config import get_settings
# Import-time configuration: set TOKENIZERS_PARALLELISM to "false" for this
# process. NOTE(review): presumably this is to avoid the Hugging Face
# tokenizers fork/parallelism warning -- confirm against deployment setup.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Raise this transformers logger to ERROR so lower-severity records from it
# are not emitted.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
# Process-wide cache for the embedding model; stays None until _get_model()
# creates the instance on first use.
_model: SentenceTransformer | None = None
# Prefix that embed_texts() prepends to every input text when its
# `instruction` flag is true.
INSTRUCTION = "Represent this candidate profile for matching job descriptions: "
def _get_model() -> SentenceTransformer:
    """Return the shared SentenceTransformer, creating it on first call.

    The instance is stored in the module-level ``_model`` variable so that
    later calls reuse it. The model identifier is read from
    ``get_settings().embedding_model``.
    """
    global _model
    # Fast path: the model was already constructed by an earlier call.
    if _model is not None:
        return _model
    _model = SentenceTransformer(get_settings().embedding_model)
    return _model
def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
    """Encode a batch of texts with the shared model and return float32 output.

    Args:
        texts: The strings to embed.
        instruction: When true, ``INSTRUCTION`` is prepended to every text
            before encoding; when false the texts are encoded as given.

    Returns:
        The result of ``model.encode`` (called with
        ``normalize_embeddings=True`` and ``batch_size=64``), converted to
        ``np.float32``.
    """
    encoder = _get_model()
    # Build a new list rather than mutating the caller's input.
    prepared = [INSTRUCTION + item for item in texts] if instruction else texts
    vectors = encoder.encode(
        prepared,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False,
    )
    return vectors.astype(np.float32)
def embed_query(text: str) -> np.ndarray:
    """Encode a single query string and return its float32 embedding.

    The query is prefixed with a fixed job-description retrieval instruction,
    encoded as a one-element batch with ``normalize_embeddings=True``, and
    the first (only) row of the result is returned as ``np.float32``.

    Args:
        text: The query text to embed.

    Returns:
        The first element of the encoded batch, cast to ``np.float32``.
    """
    prefix = "Represent this job description for retrieving matching candidates: "
    encoder = _get_model()
    vectors = encoder.encode(
        [prefix + text],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    first_row = vectors[0]
    return first_row.astype(np.float32)
def compute_text_hash(text: str) -> str:
    """Return the first 32 hex characters of the SHA-256 digest of *text*.

    The text is encoded as UTF-8 before hashing.

    >>> compute_text_hash("abc")
    'ba7816bf8f01cfea414140de5dae2223'
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    full_hex = hasher.hexdigest()
    # Keep only the leading 32 hex characters (128 bits) of the digest.
    return full_hex[:32]
|