| """LayoutLM service β LayoutLMv3-based resume encoder. |
| |
| Architecture (exact match to layoutlmv3_final.ipynb training): |
| - Backbone: microsoft/layoutlmv3-base (HuggingFace) |
| - Pooling: mean pooling over last hidden state with attention mask |
| - Normalize: L2 normalisation after pooling |
| - Output: 768-dim embeddings |
| |
| Weights: backend/ai_models/layoutlmv3_best_model.pt |
| |
| Inference approach β mirrors extract_feature_indices_by_label from training: |
| 1. Tokenize the FULL resume (all words + real per-word bboxes) once. |
| 2. Propagate word-level section labels (0=O, 1=EDU, 2=EXP, 3=LEAD) |
| to subword tokens via the Fast tokenizer's word_ids(). |
| 3. For each feature span, extract the subsequence of tokens whose label |
| matches β identical to how training built per-label embedding sets. |
| 4. Encode each subsequence through LayoutLMEncoder (pixel_values=None). |
| |
| Fallback (no tokens for a label): single zero token with attention_mask=1, |
| matching the training notebook's extract_feature_indices_by_label fallback. |
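
Example (sketch; assumes resume_preprocessor.preprocess_resume_pdf accepts a
PDF path and returns a dict containing "all_words" and "word_labels"):

    parsed = preprocess_resume_pdf("path/to/resume.pdf")
    spans = encode_resume_spans(parsed)
    spans["education"]  # 768-dim list[float], L2-normalized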
| """ |
|
|
| from __future__ import annotations |
|
|
| from pathlib import Path |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from transformers import LayoutLMv3Model, LayoutLMv3TokenizerFast |
|
|
| from ai_models.preprocessing.resume_preprocessor import ( |
| LABEL_EDUCATION, |
| LABEL_EXPERIENCE, |
| LABEL_LEADERSHIP, |
| ) |
|
|
| _WEIGHTS_PATH = Path(__file__).parent.parent / "layoutlm" / "layoutlmv3_best_model.pt" |
| _MODEL_NAME = "microsoft/layoutlmv3-base" |
| _MAX_SEQ_LENGTH = 512 |
| _NORMALIZE = True |
|
|

_LABEL_TO_SPAN: list[tuple[int, str]] = [
    (LABEL_EDUCATION, "education"),
    (LABEL_EXPERIENCE, "experience"),
    (LABEL_LEADERSHIP, "leadership"),
]

_encoder: LayoutLMEncoder | None = None
_tokenizer: LayoutLMv3TokenizerFast | None = None
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
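
# _encoder and _tokenizer are lazy singletons: _get_model() populates them on
# first call, so importing this module stays cheap and the weights are only
# loaded (and moved to _device) when an embedding is actually requested.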


class LayoutLMEncoder(nn.Module):
    """LayoutLMv3 backbone + mean-pooling head."""

    def __init__(self, model_name: str = _MODEL_NAME, pooling: str = "mean") -> None:
        super().__init__()
        self.backbone = LayoutLMv3Model.from_pretrained(model_name)
        self.pooling = pooling

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        bbox: torch.Tensor,
        pixel_values: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if pixel_values is not None:
            outputs = self.backbone(
                input_ids=input_ids,
                attention_mask=attention_mask,
                bbox=bbox,
                pixel_values=pixel_values,
                output_hidden_states=True,
            )
        else:
            outputs = self.backbone(
                input_ids=input_ids,
                attention_mask=attention_mask,
                bbox=bbox,
                output_hidden_states=True,
            )

        if self.pooling == "cls":
            embeddings = outputs.last_hidden_state[:, 0, :]
        else:
            last_hidden = outputs.last_hidden_state
            seq_len = last_hidden.shape[1]

            # When pixel_values are supplied, LayoutLMv3 appends visual patch
            # tokens, so the hidden state can be longer than the text-only
            # attention mask; align the mask to the hidden-state length.
            if attention_mask.shape[1] != seq_len:
                if attention_mask.shape[1] > seq_len:
                    attention_mask = attention_mask[:, :seq_len]
                else:
                    pad_len = seq_len - attention_mask.shape[1]
                    attention_mask = torch.cat(
                        [
                            attention_mask,
                            torch.zeros(
                                attention_mask.shape[0],
                                pad_len,
                                dtype=attention_mask.dtype,
                                device=attention_mask.device,
                            ),
                        ],
                        dim=1,
                    )

            # Masked mean pooling: sum(h_t * m_t) / sum(m_t); masked-out
            # positions contribute nothing to the embedding.
            mask_expanded = attention_mask.unsqueeze(-1).float()
            embeddings = (
                (last_hidden * mask_expanded).sum(dim=1)
                / mask_expanded.sum(dim=1).clamp(min=1e-9)
            )

        if _NORMALIZE:
            embeddings = F.normalize(embeddings, dim=-1)

        return embeddings
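
# Example (sketch, never run at import time): pushing a tiny dummy sequence
# through the encoder yields one 768-dim, L2-normalized vector per batch item.
#
#     enc = LayoutLMEncoder()
#     ids = torch.zeros((1, 4), dtype=torch.long)
#     mask = torch.ones((1, 4), dtype=torch.long)
#     boxes = torch.zeros((1, 4, 4), dtype=torch.long)
#     enc(ids, mask, boxes).shape  # torch.Size([1, 768])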


def _get_model() -> tuple[LayoutLMEncoder, LayoutLMv3TokenizerFast]:
    global _encoder, _tokenizer

    if _encoder is None:
        if not _WEIGHTS_PATH.exists():
            raise FileNotFoundError(
                f"LayoutLMv3 weights not found: {_WEIGHTS_PATH}\n"
                "Upload the file to that path (or to HuggingFace Hub and load via hf_hub_download)."
            )
        _encoder = LayoutLMEncoder(_MODEL_NAME, pooling="mean").to(_device)
        state = torch.load(str(_WEIGHTS_PATH), map_location=_device, weights_only=True)
        if isinstance(state, dict) and "model_state_dict" in state:
            state = state["model_state_dict"]
        _encoder.load_state_dict(state, strict=False)
        _encoder.eval()

    if _tokenizer is None:
        _tokenizer = LayoutLMv3TokenizerFast.from_pretrained(_MODEL_NAME)

    return _encoder, _tokenizer


def _tokenize_and_extract_by_label(
    all_words: list[dict],
    word_labels: list[int],
    tokenizer: LayoutLMv3TokenizerFast,
) -> tuple[dict, dict[str, dict]]:
    """Tokenize the full resume once, then extract per-label token subsequences.

    Mirrors the training notebook pipeline:
    1. Tokenize all words with their real bboxes.
    2. Propagate word-level label IDs to subword tokens via word_ids()
       (special tokens [CLS]/[SEP]/padding receive label -100).
    3. For each label ID, collect matching token indices and slice
       input_ids / attention_mask / bbox, exactly as
       extract_feature_indices_by_label does in training.

    Fallback when no tokens match a label:
        input_ids: zeros(1) (same as training)
        attention_mask: ones(1) (same as training, not zeros!)
        bbox: zeros(1, 4) (same as training)

    Args:
        all_words: output of extract_words_from_pdf, i.e. [{text, bbox}, ...]
        word_labels: output of assign_word_labels, one int per word

    Returns:
        (full_features, per_label_features)
        full_features: dict with input_ids/attention_mask/bbox for all tokens
        per_label_features: dict mapping span name -> feature dict
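
    Example (illustrative values, not from the notebook): if word_labels is
    [1, 1, 0], the "education" entry holds the subword tokens of the first
    two words, while "experience" and "leadership" use the fallback above.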
| """ |
| word_texts = [w["text"] for w in all_words] |
| word_boxes = [w["bbox"] for w in all_words] |
|
|
| encoding = tokenizer( |
| word_texts, |
| boxes=word_boxes, |
| max_length=_MAX_SEQ_LENGTH, |
| truncation=True, |
| return_tensors="pt", |
| |
| ) |
|
|
| input_ids = encoding["input_ids"][0] |
| attention_mask = encoding["attention_mask"][0] |
| bbox = encoding["bbox"][0] |
|

    word_ids = encoding.word_ids(batch_index=0)
    token_labels = torch.tensor([
        word_labels[wid] if (wid is not None and wid < len(word_labels)) else -100
        for wid in word_ids
    ])
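    # e.g. a word split into several subwords yields several tokens with the
    # same word id, so each subword inherits that word's label; None entries
    # (special tokens) receive -100 and never match any span label.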

    full_features = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "bbox": bbox,
    }

    per_label_features: dict[str, dict] = {}
    for label_id, span_name in _LABEL_TO_SPAN:
        mask = (token_labels == label_id)
        if not mask.any():
            # Training-notebook fallback: a single zero token that still
            # attends, so the encoder returns a well-defined embedding.
            per_label_features[span_name] = {
                "input_ids": torch.zeros(1, dtype=torch.long),
                "attention_mask": torch.ones(1, dtype=torch.long),
                "bbox": torch.zeros((1, 4), dtype=torch.long),
                "has_tokens": False,
            }
        else:
            indices = torch.where(mask)[0]
            per_label_features[span_name] = {
                "input_ids": input_ids[indices],
                "attention_mask": attention_mask[indices],
                "bbox": bbox[indices],
                "has_tokens": True,
            }

    return full_features, per_label_features


def _encode_features(features: dict, encoder: LayoutLMEncoder) -> list[float]:
    """Encode a pre-extracted token feature dict into a 768-dim embedding.

    Matches the training encode_all_features() batching approach for
    batch_size=1: no padding needed, just unsqueeze for the batch dimension.
    """
    input_ids = features["input_ids"].unsqueeze(0).to(_device)
    attention_mask = features["attention_mask"].unsqueeze(0).to(_device)
    bbox = features["bbox"].unsqueeze(0).to(_device)
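    # Shapes (for N extracted tokens): input_ids/attention_mask go from [N]
    # to [1, N], bbox from [N, 4] to [1, N, 4]; the encoder returns [1, 768].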

    with torch.no_grad():
        embedding = encoder(input_ids, attention_mask, bbox, pixel_values=None)

    return embedding[0].cpu().tolist()


def encode_resume_spans(parsed: dict) -> dict[str, list[float]]:
    """Encode all resume feature spans from preprocessed PDF output.

    Args:
        parsed: Output of resume_preprocessor.preprocess_resume_pdf().
            Must contain "all_words" and "word_labels" keys.

    Returns:
        {
            "full": 768-dim embedding of all resume tokens,
            "education": 768-dim embedding of EDUCATION-labeled tokens,
            "experience": 768-dim embedding of EXPERIENCE-labeled tokens,
            "leadership": 768-dim embedding of LEADERSHIP-labeled tokens,
        }
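
    Example (sketch): since embeddings are L2-normalized, a dot product gives
    the cosine similarity between two resumes' spans:

        a = encode_resume_spans(parsed_a)["education"]
        b = encode_resume_spans(parsed_b)["education"]
        similarity = sum(x * y for x, y in zip(a, b))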
| """ |
| encoder, tokenizer = _get_model() |
|
|
| all_words = parsed.get("all_words", []) |
| word_labels = parsed.get("word_labels", []) |
|
|
| if not all_words: |
| zero = [0.0] * 768 |
| return { |
| "full": zero, |
| "education": zero, |
| "experience": zero, |
| "leadership": zero, |
| } |
|
|
| full_features, per_label_features = _tokenize_and_extract_by_label( |
| all_words, word_labels, tokenizer |
| ) |
|
|
| result: dict[str, list[float]] = { |
| "full": _encode_features(full_features, encoder) |
| } |
| for span_name, features in per_label_features.items(): |
| result[span_name] = _encode_features(features, encoder) |
|
|
| return result |
|
|