| """SBERT service — fine-tuned SentenceTransformer for JD embedding generation. |
| |
| Model: sentence-transformers/all-mpnet-base-v2 (fine-tuned) |
| Location: backend/ai_models/sbert_finetuned/ |
| Output: 768-dim L2-normalised embeddings |
| Config: max_seq_length=384, batch_size=64 |
| """ |
|
|
| from __future__ import annotations |
|
|
| from pathlib import Path |
|
|
| from sentence_transformers import SentenceTransformer |
|
|
# Directory holding the fine-tuned model files (backend/ai_models/sbert_finetuned).
_MODEL_DIR = Path(__file__).parent.parent / "sbert_finetuned"
# Token truncation length; matches the fine-tuning config noted in the module docstring.
_MAX_SEQ_LENGTH = 384


# Lazily-initialised process-wide singleton; populated on first _get_model() call.
_model: SentenceTransformer | None = None
|
|
|
|
def _get_model() -> SentenceTransformer:
    """Return the process-wide SentenceTransformer, loading it on first use.

    Raises:
        FileNotFoundError: If the fine-tuned model directory is absent.
    """
    global _model
    # Fast path: model already loaded by a previous call.
    if _model is not None:
        return _model
    if not _MODEL_DIR.exists():
        raise FileNotFoundError(
            f"SBERT model directory not found: {_MODEL_DIR}\n"
            "Upload the fine-tuned model to that path (or to HuggingFace Hub and load via hf_hub_download)."
        )
    _model = SentenceTransformer(str(_MODEL_DIR))
    # Enforce the same truncation length used during fine-tuning.
    _model.max_seq_length = _MAX_SEQ_LENGTH
    return _model
|
|
|
|
def encode_texts(texts: list[str]) -> list[list[float]]:
    """Encode a list of texts into 768-dim embeddings.

    Matches training: convert_to_tensor=True equivalent, returned as Python lists.
    Fallback texts shorter than 10 chars are replaced before this call (caller's
    responsibility — preprocess_jd already handles the fallback).

    Args:
        texts: Texts to embed; output order matches input order.

    Returns:
        One 768-dim float list per input text (empty list for empty input).
    """
    # Skip model loading entirely for an empty batch.
    if not texts:
        return []
    model = _get_model()
    embeddings = model.encode(
        texts,
        convert_to_tensor=True,
        show_progress_bar=False,
        batch_size=64,
    )
    # One device->host transfer for the whole batch instead of one .cpu()
    # per row as the previous per-element comprehension did.
    return embeddings.cpu().numpy().tolist()
|
|
|
|
def encode_jd_spans(spans: dict[str, str]) -> dict[str, list[float]]:
    """Encode all JD spans (full, education, experience, leadership).

    Args:
        spans: Output of jd_preprocessor.preprocess_jd().

    Returns:
        Same keys, each value is a 768-dim float list.
    """
    # Fixed span order keeps the batch → key mapping deterministic.
    span_keys = ("full", "education", "experience", "leadership")
    vectors = encode_texts([spans[key] for key in span_keys])
    return {key: vec for key, vec in zip(span_keys, vectors)}
|
|