# applai-layoutlmv3 / layoutlm_service.py
"""LayoutLM service β€” LayoutLMv3-based resume encoder.
Architecture (exact match to layoutlmv3_final.ipynb training):
- Backbone: microsoft/layoutlmv3-base (HuggingFace)
- Pooling: mean pooling over last hidden state with attention mask
- Normalize: L2 normalisation after pooling
- Output: 768-dim embeddings
Weights: backend/ai_models/layoutlmv3_best_model.pt
Inference approach β€” mirrors extract_feature_indices_by_label from training:
1. Tokenize the FULL resume (all words + real per-word bboxes) once.
2. Propagate word-level section labels (0=O, 1=EDU, 2=EXP, 3=LEAD)
to subword tokens via the Fast tokenizer's word_ids().
3. For each feature span, extract the subsequence of tokens whose label
matches β€” identical to how training built per-label embedding sets.
4. Encode each subsequence through LayoutLMEncoder (pixel_values=None).
Fallback (no tokens for a label): single zero token with attention_mask=1,
matching the training notebook's extract_feature_indices_by_label fallback.
"""
from __future__ import annotations
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LayoutLMv3Model, LayoutLMv3TokenizerFast
from ai_models.preprocessing.resume_preprocessor import (
LABEL_EDUCATION,
LABEL_EXPERIENCE,
LABEL_LEADERSHIP,
)
_WEIGHTS_PATH = Path(__file__).parent.parent / "layoutlm" / "layoutlmv3_best_model.pt"
_MODEL_NAME = "microsoft/layoutlmv3-base"
_MAX_SEQ_LENGTH = 512 # matches training CONFIG['max_seq_length']
_NORMALIZE = True # matches training CONFIG['normalize_embeddings']
# Feature extraction order matches training feature_collections keys
_LABEL_TO_SPAN: list[tuple[int, str]] = [
(LABEL_EDUCATION, "education"),
(LABEL_EXPERIENCE, "experience"),
(LABEL_LEADERSHIP, "leadership"),
]
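# Given the docstring's label scheme (0=O, 1=EDU, 2=EXP, 3=LEAD), iterating
# _LABEL_TO_SPAN visits (1, "education"), (2, "experience"), (3, "leadership")
# in that order.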
_encoder: LayoutLMEncoder | None = None
_tokenizer: LayoutLMv3TokenizerFast | None = None
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ---------------------------------------------------------------------------
# Model definition - exact copy from layoutlmv3_final.ipynb
# ---------------------------------------------------------------------------
class LayoutLMEncoder(nn.Module):
"""LayoutLMv3 backbone + mean-pooling head."""
def __init__(self, model_name: str = _MODEL_NAME, pooling: str = "mean") -> None:
super().__init__()
self.backbone = LayoutLMv3Model.from_pretrained(model_name)
self.pooling = pooling
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
bbox: torch.Tensor,
pixel_values: torch.Tensor | None = None,
) -> torch.Tensor:
if pixel_values is not None:
outputs = self.backbone(
input_ids=input_ids,
attention_mask=attention_mask,
bbox=bbox,
pixel_values=pixel_values,
output_hidden_states=True,
)
else:
outputs = self.backbone(
input_ids=input_ids,
attention_mask=attention_mask,
bbox=bbox,
output_hidden_states=True,
)
if self.pooling == "cls":
embeddings = outputs.last_hidden_state[:, 0, :]
else: # mean pooling
last_hidden = outputs.last_hidden_state # [B, seq_len, 768]
seq_len = last_hidden.shape[1]
# Align attention mask to actual model output length
if attention_mask.shape[1] != seq_len:
if attention_mask.shape[1] > seq_len:
attention_mask = attention_mask[:, :seq_len]
else:
pad_len = seq_len - attention_mask.shape[1]
attention_mask = torch.cat(
[
attention_mask,
torch.zeros(
attention_mask.shape[0],
pad_len,
dtype=attention_mask.dtype,
device=attention_mask.device,
),
],
dim=1,
)
mask_expanded = attention_mask.unsqueeze(-1).float() # [B, seq_len, 1]
embeddings = (
(last_hidden * mask_expanded).sum(dim=1)
/ mask_expanded.sum(dim=1).clamp(min=1e-9)
)
if _NORMALIZE:
embeddings = F.normalize(embeddings, dim=-1)
return embeddings
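# Pooling sketch (illustrative numbers): for last_hidden of shape [1, 3, 768]
# with attention_mask [[1, 1, 0]], the masked mean reduces to (h_0 + h_1) / 2;
# masked positions contribute nothing, and clamp(min=1e-9) only guards the
# degenerate all-zeros-mask case.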
# ---------------------------------------------------------------------------
# Lazy model loader
# ---------------------------------------------------------------------------
def _get_model() -> tuple[LayoutLMEncoder, LayoutLMv3TokenizerFast]:
global _encoder, _tokenizer
if _encoder is None:
if not _WEIGHTS_PATH.exists():
raise FileNotFoundError(
f"LayoutLMv3 weights not found: {_WEIGHTS_PATH}\n"
"Upload the file to that path (or to HuggingFace Hub and load via hf_hub_download)."
)
_encoder = LayoutLMEncoder(_MODEL_NAME, pooling="mean").to(_device)
state = torch.load(str(_WEIGHTS_PATH), map_location=_device, weights_only=True)
if isinstance(state, dict) and "model_state_dict" in state:
state = state["model_state_dict"]
_encoder.load_state_dict(state, strict=False)
_encoder.eval()
if _tokenizer is None:
_tokenizer = LayoutLMv3TokenizerFast.from_pretrained(_MODEL_NAME)
return _encoder, _tokenizer
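# Hub-based loading alternative mentioned in the error above (a sketch, not
# wired in; the repo id is hypothetical):
#
#     from huggingface_hub import hf_hub_download
#
#     weights_path = hf_hub_download(
#         repo_id="your-org/applai-layoutlmv3",  # hypothetical repo id
#         filename="layoutlmv3_best_model.pt",
#     )
#     state = torch.load(weights_path, map_location=_device, weights_only=True)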
# ---------------------------------------------------------------------------
# Token-level feature extraction - mirrors extract_feature_indices_by_label
# ---------------------------------------------------------------------------
def _tokenize_and_extract_by_label(
all_words: list[dict],
word_labels: list[int],
tokenizer: LayoutLMv3TokenizerFast,
) -> tuple[dict, dict[str, dict]]:
"""Tokenize the full resume once, then extract per-label token subsequences.
Mirrors the training notebook pipeline:
1. Tokenize all words with their real bboxes.
2. Propagate word-level label IDs to subword tokens via word_ids()
(special tokens [CLS]/[SEP]/padding receive label -100).
    3. For each label ID, collect matching token indices and slice
       input_ids / attention_mask / bbox, exactly as
       extract_feature_indices_by_label does in training.
    Fallback when no tokens match a label:
        input_ids: zeros(1), same as training
        attention_mask: ones(1), same as training (not zeros!)
        bbox: zeros(1, 4), same as training
    Args:
        all_words: output of extract_words_from_pdf, [{text, bbox}, ...]
        word_labels: output of assign_word_labels, one int per word
    Returns:
        (full_features, per_label_features)
        full_features: dict with input_ids/attention_mask/bbox for all tokens
        per_label_features: dict mapping span name -> feature dict
"""
word_texts = [w["text"] for w in all_words]
word_boxes = [w["bbox"] for w in all_words]
encoding = tokenizer(
word_texts,
boxes=word_boxes,
max_length=_MAX_SEQ_LENGTH,
truncation=True,
return_tensors="pt",
        # No padding: single sample; variable length matches training's dynamic padding
)
input_ids = encoding["input_ids"][0] # [seq_len]
attention_mask = encoding["attention_mask"][0] # [seq_len]
bbox = encoding["bbox"][0] # [seq_len, 4]
# Propagate word-level labels to subword tokens
# word_ids() returns None for special tokens ([CLS], [SEP])
word_ids = encoding.word_ids(batch_index=0)
token_labels = torch.tensor([
word_labels[wid] if (wid is not None and wid < len(word_labels)) else -100
for wid in word_ids
])
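    # Worked example (illustrative): words ["BS", "Physics"] labeled [1, 1];
    # if "Physics" splits into two subwords, then
    #     word_ids()   == [None, 0, 1, 1, None]   (special, word, sub, sub, special)
    #     token_labels == [-100, 1, 1, 1, -100]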
    # Full features: all tokens
full_features = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"bbox": bbox,
}
    # Per-label subsequences: mirrors extract_feature_indices_by_label
per_label_features: dict[str, dict] = {}
for label_id, span_name in _LABEL_TO_SPAN:
mask = (token_labels == label_id)
if not mask.any():
            # No tokens for this label; training fallback: zeros + ones attention
per_label_features[span_name] = {
"input_ids": torch.zeros(1, dtype=torch.long),
"attention_mask": torch.ones(1, dtype=torch.long),
"bbox": torch.zeros((1, 4), dtype=torch.long),
"has_tokens": False,
}
else:
indices = torch.where(mask)[0]
per_label_features[span_name] = {
"input_ids": input_ids[indices],
"attention_mask": attention_mask[indices],
"bbox": bbox[indices],
"has_tokens": True,
}
return full_features, per_label_features
# ---------------------------------------------------------------------------
# Encoding helper
# ---------------------------------------------------------------------------
def _encode_features(features: dict, encoder: LayoutLMEncoder) -> list[float]:
"""Encode a pre-extracted token feature dict into a 768-dim embedding.
Matches the training encode_all_features() batching approach for batch_size=1:
no padding needed, just unsqueeze for the batch dimension.
"""
input_ids = features["input_ids"].unsqueeze(0).to(_device) # [1, seq_len]
attention_mask = features["attention_mask"].unsqueeze(0).to(_device)
bbox = features["bbox"].unsqueeze(0).to(_device)
with torch.no_grad():
embedding = encoder(input_ids, attention_mask, bbox, pixel_values=None)
return embedding[0].cpu().tolist()
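# Shape trace (illustrative): a 12-token "education" span enters as
# input_ids [1, 12], attention_mask [1, 12], bbox [1, 12, 4] and leaves as a
# single L2-normalized 768-dim list[float].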
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def encode_resume_spans(parsed: dict) -> dict[str, list[float]]:
"""Encode all resume feature spans from preprocessed PDF output.
Args:
parsed: Output of resume_preprocessor.preprocess_resume_pdf().
Must contain "all_words" and "word_labels" keys.
Returns:
{
"full": 768-dim embedding of all resume tokens,
"education": 768-dim embedding of EDUCATION-labeled tokens,
"experience": 768-dim embedding of EXPERIENCE-labeled tokens,
"leadership": 768-dim embedding of LEADERSHIP-labeled tokens,
}
"""
encoder, tokenizer = _get_model()
all_words = parsed.get("all_words", [])
word_labels = parsed.get("word_labels", [])
    if not all_words:
        # Build independent zero vectors; a single shared list would alias all
        # four keys, so mutating one span would silently mutate the others.
        return {
            name: [0.0] * 768
            for name in ("full", "education", "experience", "leadership")
        }
full_features, per_label_features = _tokenize_and_extract_by_label(
all_words, word_labels, tokenizer
)
result: dict[str, list[float]] = {
"full": _encode_features(full_features, encoder)
}
for span_name, features in per_label_features.items():
result[span_name] = _encode_features(features, encoder)
return result
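# Example usage (a sketch; assumes resume_preprocessor exposes the
# preprocess_resume_pdf entry point referenced above and that it accepts a
# file path):
#
#     from ai_models.preprocessing.resume_preprocessor import preprocess_resume_pdf
#
#     parsed = preprocess_resume_pdf("resume.pdf")
#     spans = encode_resume_spans(parsed)
#     assert len(spans["full"]) == 768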