coderound / backend /src /ml /embedder.py
ketannnn's picture
feat: stabilize celery loops, add redis caching, and scale stage 2 neural limit to 250
b7d367d
import hashlib
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import logging
from ..config import get_settings
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably read by the Hugging Face `tokenizers` library to turn
# off its internal parallelism (e.g. to avoid fork-related warnings in worker
# processes) — confirm against the deployment setup.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Let only ERROR-and-above records through on the transformers tokenizer base logger.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
# Module-level cache for the embedding model; stays None until _get_model() fills it.
_model: SentenceTransformer | None = None
# Prefix that embed_texts() prepends to every input text when instruction=True.
INSTRUCTION = "Represent this candidate profile for matching job descriptions: "
def _get_model() -> SentenceTransformer:
    """Return the shared SentenceTransformer, constructing it on first call.

    The instance is stored in the module-level ``_model`` variable so that
    later calls reuse it instead of building a new one. The model identifier
    is read from ``get_settings().embedding_model``.
    """
    global _model
    # Fast path: a previous call already built the model.
    if _model is not None:
        return _model
    _model = SentenceTransformer(get_settings().embedding_model)
    return _model
def embed_texts(texts: list[str], instruction: bool = True) -> np.ndarray:
    """Encode a list of texts with the shared embedding model.

    Args:
        texts: Strings to embed.
        instruction: When true, ``INSTRUCTION`` is prepended to every text
            before encoding; when false the texts are encoded as given.

    Returns:
        The encoder output cast to ``float32``. The encoder is called with
        ``normalize_embeddings=True``.
        NOTE(review): presumably one row per input text — confirm against the
        SentenceTransformer version in use.
    """
    encoder = _get_model()
    prepared = [INSTRUCTION + text for text in texts] if instruction else texts
    vectors = encoder.encode(
        prepared,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=False,
    )
    return vectors.astype(np.float32)
def embed_query(text: str) -> np.ndarray:
    """Encode a single job-description query with the shared embedding model.

    A fixed retrieval instruction is prepended to ``text``; the result is
    encoded as a one-element batch with ``normalize_embeddings=True``.

    Args:
        text: The query text to embed.

    Returns:
        The first (only) item of the encoder output, cast to ``float32``.
    """
    encoder = _get_model()
    prefix = "Represent this job description for retrieving matching candidates: "
    # Plain concatenation (not formatting) so a non-str ``text`` still raises TypeError.
    batch = [prefix + text]
    encoded = encoder.encode(batch, normalize_embeddings=True, show_progress_bar=False)
    first = encoded[0]
    return first.astype(np.float32)
def compute_text_hash(text: str) -> str:
    """Return the first 32 hex characters of the SHA-256 digest of ``text``.

    The text is encoded as UTF-8 before hashing, so the result is a
    deterministic 32-character lowercase hexadecimal string.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:32]