# applai-sbert / sbert_service.py
# Uploaded via huggingface_hub by Smutypi3 (commit a178dde, verified).
"""SBERT service — fine-tuned SentenceTransformer for JD embedding generation.
Model: sentence-transformers/all-mpnet-base-v2 (fine-tuned)
Location: backend/ai_models/sbert_finetuned/
Output: 768-dim L2-normalised embeddings
Config: max_seq_length=384, batch_size=64
"""
from __future__ import annotations
from pathlib import Path
from sentence_transformers import SentenceTransformer
_MODEL_DIR = Path(__file__).parent.parent / "sbert_finetuned"
_MAX_SEQ_LENGTH = 384 # matches training CONFIG
_model: SentenceTransformer | None = None
def _get_model() -> SentenceTransformer:
    """Return the process-wide SBERT model, loading it lazily on first call.

    Raises:
        FileNotFoundError: If the fine-tuned model directory is missing.
    """
    global _model
    # Fast path: already loaded for this process.
    if _model is not None:
        return _model
    if not _MODEL_DIR.exists():
        raise FileNotFoundError(
            f"SBERT model directory not found: {_MODEL_DIR}\n"
            "Upload the fine-tuned model to that path (or to HuggingFace Hub and load via hf_hub_download)."
        )
    loaded = SentenceTransformer(str(_MODEL_DIR))
    # Truncation length must match the value used during fine-tuning.
    loaded.max_seq_length = _MAX_SEQ_LENGTH
    _model = loaded
    return _model
def encode_texts(texts: list[str]) -> list[list[float]]:
    """Encode a list of texts into 768-dim embeddings.

    Fallback texts shorter than 10 chars are replaced before this call
    (caller's responsibility — preprocess_jd already handles the fallback).

    Args:
        texts: Texts to embed; may be empty.

    Returns:
        One 768-dim float list per input text, in input order. Empty input
        yields an empty list without loading the model.
    """
    # Short-circuit so an empty batch never forces a model load.
    if not texts:
        return []
    model = _get_model()
    # convert_to_numpy does a single device->host transfer for the whole
    # batch; the previous per-row tensor loop (.cpu().numpy().tolist() per
    # embedding) paid one transfer per text. Resulting lists are identical.
    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=False,
        batch_size=64,  # matches training CONFIG
    )
    return embeddings.tolist()
def encode_jd_spans(spans: dict[str, str]) -> dict[str, list[float]]:
    """Encode all JD spans (full, education, experience, leadership).

    Args:
        spans: Output of jd_preprocessor.preprocess_jd().

    Returns:
        Same keys, each value is a 768-dim float list.
    """
    span_keys = ("full", "education", "experience", "leadership")
    # Encode all four spans in one batch, then pair results back to keys.
    vectors = encode_texts([spans[key] for key in span_keys])
    return {key: vec for key, vec in zip(span_keys, vectors)}