File size: 1,296 Bytes
bdaeeeb
 
 
b7d367d
 
bdaeeeb
 
b7d367d
 
 
bdaeeeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import hashlib
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import logging
from ..config import get_settings

# Import-time side effect: force TOKENIZERS_PARALLELISM to "false" for this
# process. NOTE(review): presumably this avoids the tokenizers library's
# parallelism/fork warning — confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Raise this logger's threshold to ERROR so lower-severity records from it
# are not emitted.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# Process-wide cache for the embedding model; stays None until `_get_model`
# constructs the model on first use.
_model: SentenceTransformer | None = None
# Prefix that `embed_texts` prepends to every input text when its
# `instruction` flag is truthy.
INSTRUCTION = "Represent this candidate profile for matching job descriptions: "


def _get_model() -> SentenceTransformer:
    """Return the shared embedding model, building it on the first call.

    The instance is stored in the module-level ``_model`` variable, so the
    ``SentenceTransformer`` constructor runs at most once per process (as
    long as ``_model`` is not reset elsewhere). The model name is read from
    ``get_settings().embedding_model``.

    Returns:
        The cached ``SentenceTransformer`` instance.
    """
    global _model
    # Fast path: the model was already constructed by an earlier call.
    if _model is not None:
        return _model
    model_name = get_settings().embedding_model
    _model = SentenceTransformer(model_name)
    return _model


def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
    """Encode a batch of texts with the shared embedding model.

    Args:
        texts: The strings to embed.
        instruction: When truthy, ``INSTRUCTION`` is prepended to every text
            before encoding; when falsy, the texts are encoded unchanged.

    Returns:
        The value returned by ``model.encode`` cast to ``float32``.
        Normalized embeddings are requested via ``normalize_embeddings=True``.
        NOTE(review): presumably one row per input text — confirm against
        the sentence-transformers documentation.
    """
    encoder = _get_model()
    # Build a new list rather than mutating the caller's list in place.
    inputs = [INSTRUCTION + text for text in texts] if instruction else texts
    vectors = encoder.encode(
        inputs,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False,
    )
    return vectors.astype(np.float32)


def embed_query(text: str) -> np.ndarray:
    """Encode a single query text with the shared embedding model.

    A fixed query-side instruction prefix is prepended to ``text`` before
    encoding. The text is submitted as a one-element batch and the first
    entry of the result is returned.

    Args:
        text: The query text to embed.

    Returns:
        The first element of the ``model.encode`` result, cast to
        ``float32``. Normalized embeddings are requested via
        ``normalize_embeddings=True``.
    """
    encoder = _get_model()
    prefix = "Represent this job description for retrieving matching candidates: "
    prompt = prefix + text
    batch = encoder.encode(
        [prompt],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    # Only one prompt was submitted, so take the single result at index 0.
    first = batch[0]
    return first.astype(np.float32)


def compute_text_hash(text: str) -> str:
    """Return a 32-character hexadecimal fingerprint of *text*.

    The text is UTF-8 encoded and hashed with SHA-256; the hex digest is
    truncated to its first 32 characters (the leading 128 bits).

    Args:
        text: The string to fingerprint.

    Returns:
        The first 32 lowercase hex characters of the SHA-256 digest.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:32]