# applai-layoutlmv3 / layoutlm_service.py
"""LayoutLM service β€” LayoutLMv3-based resume encoder.
Architecture (exact match to layoutlmv3_final.ipynb training):
- Backbone: microsoft/layoutlmv3-base (HuggingFace)
- Pooling: mean pooling over last hidden state with attention mask
- Normalize: L2 normalisation after pooling
- Output: 768-dim embeddings
Weights: backend/ai_models/layoutlmv3_best_model.pt
Inference approach β€” mirrors extract_feature_indices_by_label from training:
1. Tokenize the FULL resume (all words + real per-word bboxes) once.
2. Propagate word-level section labels (0=O, 1=EDU, 2=EXP, 3=LEAD)
to subword tokens via the Fast tokenizer's word_ids().
3. For each feature span, extract the subsequence of tokens whose label
matches β€” identical to how training built per-label embedding sets.
4. Encode each subsequence through LayoutLMEncoder (pixel_values=None).
Fallback (no tokens for a label): single zero token with attention_mask=1,
matching the training notebook's extract_feature_indices_by_label fallback.
"""
from __future__ import annotations
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LayoutLMv3Model, LayoutLMv3TokenizerFast
from ai_models.preprocessing.resume_preprocessor import (
LABEL_EDUCATION,
LABEL_EXPERIENCE,
LABEL_LEADERSHIP,
)
_WEIGHTS_PATH = Path(__file__).parent.parent / "layoutlm" / "layoutlmv3_best_model.pt"
_MODEL_NAME = "microsoft/layoutlmv3-base"
_MAX_SEQ_LENGTH = 512 # matches training CONFIG['max_seq_length']
_NORMALIZE = True # matches training CONFIG['normalize_embeddings']
# Feature extraction order matches training feature_collections keys
_LABEL_TO_SPAN: list[tuple[int, str]] = [
(LABEL_EDUCATION, "education"),
(LABEL_EXPERIENCE, "experience"),
(LABEL_LEADERSHIP, "leadership"),
]
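# Given the docstring's label scheme (0=O, 1=EDU, 2=EXP, 3=LEAD), iterating
# _LABEL_TO_SPAN visits (1, "education"), (2, "experience"), (3, "leadership")
# in that order.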
_encoder: LayoutLMEncoder | None = None
_tokenizer: LayoutLMv3TokenizerFast | None = None
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ---------------------------------------------------------------------------
# Model definition - exact copy from layoutlmv3_final.ipynb
# ---------------------------------------------------------------------------
class LayoutLMEncoder(nn.Module):
"""LayoutLMv3 backbone + mean-pooling head."""
def __init__(self, model_name: str = _MODEL_NAME, pooling: str = "mean") -> None:
super().__init__()
self.backbone = LayoutLMv3Model.from_pretrained(model_name)
self.pooling = pooling
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
bbox: torch.Tensor,
pixel_values: torch.Tensor | None = None,
) -> torch.Tensor:
if pixel_values is not None:
outputs = self.backbone(
input_ids=input_ids,
attention_mask=attention_mask,
bbox=bbox,
pixel_values=pixel_values,
output_hidden_states=True,
)
else:
outputs = self.backbone(
input_ids=input_ids,
attention_mask=attention_mask,
bbox=bbox,
output_hidden_states=True,
)
if self.pooling == "cls":
embeddings = outputs.last_hidden_state[:, 0, :]
else: # mean pooling
last_hidden = outputs.last_hidden_state # [B, seq_len, 768]
seq_len = last_hidden.shape[1]
# Align attention mask to actual model output length
if attention_mask.shape[1] != seq_len:
if attention_mask.shape[1] > seq_len:
attention_mask = attention_mask[:, :seq_len]
else:
pad_len = seq_len - attention_mask.shape[1]
attention_mask = torch.cat(
[
attention_mask,
torch.zeros(
attention_mask.shape[0],
pad_len,
dtype=attention_mask.dtype,
device=attention_mask.device,
),
],
dim=1,
)
mask_expanded = attention_mask.unsqueeze(-1).float() # [B, seq_len, 1]
embeddings = (
(last_hidden * mask_expanded).sum(dim=1)
/ mask_expanded.sum(dim=1).clamp(min=1e-9)
)
if _NORMALIZE:
embeddings = F.normalize(embeddings, dim=-1)
return embeddings
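# Pooling sketch (illustrative numbers): for last_hidden of shape [1, 3, 768]
# with attention_mask [[1, 1, 0]], the masked mean reduces to (h_0 + h_1) / 2;
# masked positions contribute nothing, and clamp(min=1e-9) only guards the
# degenerate all-zeros-mask case.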
# ---------------------------------------------------------------------------
# Lazy model loader
# ---------------------------------------------------------------------------
def _get_model() -> tuple[LayoutLMEncoder, LayoutLMv3TokenizerFast]:
global _encoder, _tokenizer
if _encoder is None:
if not _WEIGHTS_PATH.exists():
raise FileNotFoundError(
f"LayoutLMv3 weights not found: {_WEIGHTS_PATH}\n"
"Upload the file to that path (or to HuggingFace Hub and load via hf_hub_download)."
)
_encoder = LayoutLMEncoder(_MODEL_NAME, pooling="mean").to(_device)
state = torch.load(str(_WEIGHTS_PATH), map_location=_device, weights_only=True)
if isinstance(state, dict) and "model_state_dict" in state:
state = state["model_state_dict"]
_encoder.load_state_dict(state, strict=False)
_encoder.eval()
if _tokenizer is None:
_tokenizer = LayoutLMv3TokenizerFast.from_pretrained(_MODEL_NAME)
return _encoder, _tokenizer
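# Hub-based loading alternative mentioned in the error above (a sketch, not
# wired in; the repo id is hypothetical):
#
#     from huggingface_hub import hf_hub_download
#
#     weights_path = hf_hub_download(
#         repo_id="your-org/applai-layoutlmv3",  # hypothetical repo id
#         filename="layoutlmv3_best_model.pt",
#     )
#     state = torch.load(weights_path, map_location=_device, weights_only=True)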
# ---------------------------------------------------------------------------
# Token-level feature extraction - mirrors extract_feature_indices_by_label
# ---------------------------------------------------------------------------
def _tokenize_and_extract_by_label(
all_words: list[dict],
word_labels: list[int],
tokenizer: LayoutLMv3TokenizerFast,
) -> tuple[dict, dict[str, dict]]:
"""Tokenize the full resume once, then extract per-label token subsequences.
Mirrors the training notebook pipeline:
1. Tokenize all words with their real bboxes.
2. Propagate word-level label IDs to subword tokens via word_ids()
(special tokens [CLS]/[SEP]/padding receive label -100).
    3. For each label ID, collect matching token indices and slice
       input_ids / attention_mask / bbox, exactly as
       extract_feature_indices_by_label does in training.
    Fallback when no tokens match a label:
        input_ids: zeros(1), same as training
        attention_mask: ones(1), same as training (not zeros!)
        bbox: zeros(1, 4), same as training
    Args:
        all_words: output of extract_words_from_pdf, [{text, bbox}, ...]
        word_labels: output of assign_word_labels, one int per word
    Returns:
        (full_features, per_label_features)
        full_features: dict with input_ids/attention_mask/bbox for all tokens
        per_label_features: dict mapping span name -> feature dict
"""
word_texts = [w["text"] for w in all_words]
word_boxes = [w["bbox"] for w in all_words]
encoding = tokenizer(
word_texts,
boxes=word_boxes,
max_length=_MAX_SEQ_LENGTH,
truncation=True,
return_tensors="pt",
        # No padding: single sample; variable length matches training's dynamic padding
)
input_ids = encoding["input_ids"][0] # [seq_len]
attention_mask = encoding["attention_mask"][0] # [seq_len]
bbox = encoding["bbox"][0] # [seq_len, 4]
# Propagate word-level labels to subword tokens
# word_ids() returns None for special tokens ([CLS], [SEP])
word_ids = encoding.word_ids(batch_index=0)
token_labels = torch.tensor([
word_labels[wid] if (wid is not None and wid < len(word_labels)) else -100
for wid in word_ids
])
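    # Worked example (illustrative): words ["BS", "Physics"] labeled [1, 1];
    # if "Physics" splits into two subwords, then
    #     word_ids()   == [None, 0, 1, 1, None]   (special, word, sub, sub, special)
    #     token_labels == [-100, 1, 1, 1, -100]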
    # Full features: all tokens
full_features = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"bbox": bbox,
}
    # Per-label subsequences: mirrors extract_feature_indices_by_label
per_label_features: dict[str, dict] = {}
for label_id, span_name in _LABEL_TO_SPAN:
mask = (token_labels == label_id)
if not mask.any():
            # No tokens for this label; training fallback: zeros + ones attention
per_label_features[span_name] = {
"input_ids": torch.zeros(1, dtype=torch.long),
"attention_mask": torch.ones(1, dtype=torch.long),
"bbox": torch.zeros((1, 4), dtype=torch.long),
"has_tokens": False,
}
else:
indices = torch.where(mask)[0]
per_label_features[span_name] = {
"input_ids": input_ids[indices],
"attention_mask": attention_mask[indices],
"bbox": bbox[indices],
"has_tokens": True,
}
return full_features, per_label_features
# ---------------------------------------------------------------------------
# Encoding helper
# ---------------------------------------------------------------------------
def _encode_features(features: dict, encoder: LayoutLMEncoder) -> list[float]:
"""Encode a pre-extracted token feature dict into a 768-dim embedding.
Matches the training encode_all_features() batching approach for batch_size=1:
no padding needed, just unsqueeze for the batch dimension.
"""
input_ids = features["input_ids"].unsqueeze(0).to(_device) # [1, seq_len]
attention_mask = features["attention_mask"].unsqueeze(0).to(_device)
bbox = features["bbox"].unsqueeze(0).to(_device)
with torch.no_grad():
embedding = encoder(input_ids, attention_mask, bbox, pixel_values=None)
return embedding[0].cpu().tolist()
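# Shape trace (illustrative): a 12-token "education" span enters as
# input_ids [1, 12], attention_mask [1, 12], bbox [1, 12, 4] and leaves as a
# single L2-normalized 768-dim list[float].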
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def encode_resume_spans(parsed: dict) -> dict[str, list[float]]:
"""Encode all resume feature spans from preprocessed PDF output.
Args:
parsed: Output of resume_preprocessor.preprocess_resume_pdf().
Must contain "all_words" and "word_labels" keys.
Returns:
{
"full": 768-dim embedding of all resume tokens,
"education": 768-dim embedding of EDUCATION-labeled tokens,
"experience": 768-dim embedding of EXPERIENCE-labeled tokens,
"leadership": 768-dim embedding of LEADERSHIP-labeled tokens,
}
"""
encoder, tokenizer = _get_model()
all_words = parsed.get("all_words", [])
word_labels = parsed.get("word_labels", [])
    if not all_words:
        # Build independent zero vectors; a single shared list would alias all
        # four keys, so mutating one span would silently mutate the others.
        return {
            name: [0.0] * 768
            for name in ("full", "education", "experience", "leadership")
        }
full_features, per_label_features = _tokenize_and_extract_by_label(
all_words, word_labels, tokenizer
)
result: dict[str, list[float]] = {
"full": _encode_features(full_features, encoder)
}
for span_name, features in per_label_features.items():
result[span_name] = _encode_features(features, encoder)
return result
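# Example usage (a sketch; assumes resume_preprocessor exposes the
# preprocess_resume_pdf entry point referenced above and that it accepts a
# file path):
#
#     from ai_models.preprocessing.resume_preprocessor import preprocess_resume_pdf
#
#     parsed = preprocess_resume_pdf("resume.pdf")
#     spans = encode_resume_spans(parsed)
#     assert len(spans["full"]) == 768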