File size: 19,837 Bytes

"""
Tiny BERT models for anime filename token classification.

The default linear token-classification head is kept for compatibility.  A
learned linear-chain CRF head is also available for structural sequence-label
training while preserving the same emission logits used by the thin runtime.
"""

from __future__ import annotations

import os
from typing import List, Optional

import torch
from torch import nn
from transformers import BertConfig, BertForTokenClassification, BertModel, BertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.modeling_utils import PreTrainedModel

from .config import Config
from .labels import infer_legacy_id2label, label_migration_sources


class LinearChainCRF(nn.Module):
    """A small batched linear-chain CRF for BIO token classification."""

    def __init__(self, num_labels: int, id2label: Optional[dict] = None) -> None:
        super().__init__()
        self.num_labels = num_labels
        self.start_transitions = nn.Parameter(torch.zeros(num_labels))
        self.end_transitions = nn.Parameter(torch.zeros(num_labels))
        self.transitions = nn.Parameter(torch.zeros(num_labels, num_labels))
        self.register_buffer("start_allowed", torch.ones(num_labels, dtype=torch.bool))
        self.register_buffer("transition_allowed", torch.ones(num_labels, num_labels, dtype=torch.bool))
        if id2label:
            self._configure_bio_masks(id2label)

    @staticmethod
    def _normalize_label_map(id2label: dict) -> dict[int, str]:
        return {int(label_id): str(label) for label_id, label in id2label.items()}

    def _configure_bio_masks(self, id2label: dict) -> None:
        label_map = self._normalize_label_map(id2label)
        for prev_id in range(self.num_labels):
            prev_label = label_map.get(prev_id, "O")
            self.start_allowed[prev_id] = not prev_label.startswith("I-")
            for next_id in range(self.num_labels):
                next_label = label_map.get(next_id, "O")
                if next_label.startswith("I-"):
                    entity = next_label[2:]
                    allowed = prev_label in {f"B-{entity}", f"I-{entity}"}
                else:
                    allowed = True
                self.transition_allowed[prev_id, next_id] = allowed

    def neg_log_likelihood(
        self,
        emissions: torch.Tensor,
        tags: torch.Tensor,
        mask: torch.Tensor,
    ) -> torch.Tensor:
        """Return mean negative log likelihood for a padded batch."""
        if emissions.ndim != 3:
            raise ValueError("emissions must have shape [batch, seq, labels]")
        if tags.shape != emissions.shape[:2]:
            raise ValueError("tags must have shape [batch, seq]")
        if mask.shape != tags.shape:
            raise ValueError("mask must have shape [batch, seq]")

        mask = mask.bool()
        lengths = mask.long().sum(dim=1)
        if torch.any(lengths == 0):
            raise ValueError("CRF received an empty token sequence")

        safe_tags = tags.masked_fill(~mask, 0)
        log_partition = self._compute_log_partition(emissions, mask)
        gold_score = self._compute_gold_score(emissions, safe_tags, mask, lengths)
        return (log_partition - gold_score).mean()

    def _compute_log_partition(self, emissions: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        batch_size, sequence_length, _num_labels = emissions.shape
        emissions = emissions.float()
        start_transitions = self.start_transitions.float()
        transition_scores = self.transitions.float()
        scores = start_transitions + emissions[:, 0]

        for idx in range(1, sequence_length):
            next_scores = (
                scores.unsqueeze(2)
                + transition_scores.unsqueeze(0)
                + emissions[:, idx].unsqueeze(1)
            )
            next_scores = torch.logsumexp(next_scores, dim=1)
            scores = torch.where(mask[:, idx].unsqueeze(1), next_scores, scores)

        scores = scores + self.end_transitions
        return torch.logsumexp(scores, dim=1)

    def _compute_gold_score(
        self,
        emissions: torch.Tensor,
        tags: torch.Tensor,
        mask: torch.Tensor,
        lengths: torch.Tensor,
    ) -> torch.Tensor:
        emissions = emissions.float()
        start_transitions = self.start_transitions.float()
        transition_scores = self.transitions.float()
        end_transitions = self.end_transitions.float()
        batch_indices = torch.arange(emissions.shape[0], device=emissions.device)
        score = start_transitions[tags[:, 0]]
        score = score + emissions[batch_indices, 0, tags[:, 0]]

        for idx in range(1, emissions.shape[1]):
            transition_score = transition_scores[tags[:, idx - 1], tags[:, idx]]
            emission_score = emissions[batch_indices, idx, tags[:, idx]]
            score = score + (transition_score + emission_score) * mask[:, idx]

        last_tag_indices = (lengths - 1).unsqueeze(1)
        last_tags = tags.gather(1, last_tag_indices).squeeze(1)
        return score + end_transitions[last_tags]

    def decode(self, emissions: torch.Tensor, mask: torch.Tensor) -> List[List[int]]:
        """Viterbi decode a padded batch and return variable-length label IDs."""
        if emissions.ndim != 3:
            raise ValueError("emissions must have shape [batch, seq, labels]")
        mask = mask.bool()
        lengths = mask.long().sum(dim=1)
        if torch.any(lengths == 0):
            raise ValueError("CRF received an empty token sequence")

        start_transitions = self.start_transitions.masked_fill(~self.start_allowed, float("-inf"))
        transition_scores = self.transitions.masked_fill(~self.transition_allowed, float("-inf"))
        scores = start_transitions + emissions[:, 0]
        history: List[torch.Tensor] = []

        for idx in range(1, emissions.shape[1]):
            next_scores = scores.unsqueeze(2) + transition_scores.unsqueeze(0)
            best_scores, best_tags = next_scores.max(dim=1)
            best_scores = best_scores + emissions[:, idx]
            scores = torch.where(mask[:, idx].unsqueeze(1), best_scores, scores)
            history.append(best_tags)

        scores = scores + self.end_transitions
        best_last_tags = scores.argmax(dim=1)

        paths: List[List[int]] = []
        for batch_idx in range(emissions.shape[0]):
            length = int(lengths[batch_idx].item())
            best_tag = int(best_last_tags[batch_idx].item())
            path = [best_tag]
            for hist in reversed(history[: max(0, length - 1)]):
                best_tag = int(hist[batch_idx, best_tag].item())
                path.append(best_tag)
            path.reverse()
            paths.append(path)
        return paths


class BertCrfForTokenClassification(BertPreTrainedModel):
    """BERT emission classifier trained with a learned CRF sequence loss."""

    config_class = BertConfig

    def __init__(self, config: BertConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        classifier_dropout = getattr(config, "classifier_dropout", None)
        dropout_prob = classifier_dropout if classifier_dropout is not None else config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = LinearChainCRF(config.num_labels, getattr(config, "id2label", None))
        self.post_init()
        # Keep CRF transitions neutral when bootstrapping from a linear checkpoint.
        nn.init.zeros_(self.crf.start_transitions)
        nn.init.zeros_(self.crf.end_transitions)
        nn.init.zeros_(self.crf.transitions)

    def _crf_inputs(
        self,
        logits: torch.Tensor,
        labels: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        if logits.shape[1] <= 2:
            raise ValueError("CRF token classification expects CLS, tokens, and SEP positions")
        emissions = logits[:, 1:-1, :]
        if attention_mask is None:
            if labels is None:
                mask = torch.ones(emissions.shape[:2], dtype=torch.bool, device=logits.device)
            else:
                mask = labels[:, 1:-1].ne(-100)
        else:
            if labels is None:
                real_lengths = attention_mask.long().sum(dim=1).clamp_min(2) - 2
                positions = torch.arange(emissions.shape[1], device=logits.device).unsqueeze(0)
                mask = positions < real_lengths.unsqueeze(1)
            else:
                mask = attention_mask[:, 1:-1].bool()
                mask = mask & labels[:, 1:-1].ne(-100)
        tags = labels[:, 1:-1] if labels is not None else None
        return emissions, tags, mask

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> TokenClassifierOutput:
        return_dict = return_dict if return_dict is not None else getattr(self.config, "return_dict", True)
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            emissions, tags, mask = self._crf_inputs(logits, labels, attention_mask)
            if tags is None:
                raise ValueError("labels are required for CRF loss")
            loss = self.crf.neg_log_likelihood(emissions, tags, mask)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def decode(self, logits: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> List[List[int]]:
        """Decode full-sequence logits, excluding CLS/SEP and padding positions."""
        emissions, _tags, mask = self._crf_inputs(logits, None, attention_mask)
        return self.crf.decode(emissions, mask)


def build_bert_config(config: Config) -> BertConfig:
    """Build the Hugging Face BERT config shared by both model heads."""
    return BertConfig(
        vocab_size=config.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        max_position_embeddings=config.max_position_embeddings,
        num_labels=config.num_labels,
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        id2label=config.id2label,
        label2id=config.label2id,
        label_schema_version=config.label_schema_version,
    )


def normalize_model_head(model_head: Optional[str]) -> str:
    head = (model_head or "linear").strip().lower()
    if head not in {"linear", "crf"}:
        raise ValueError(f"Unsupported model head: {model_head}")
    return head


def create_model(config: Config, model_head: str = "linear") -> PreTrainedModel:
    """
    Create a Tiny BERT model for token classification.

    Args:
        config: Config object with model hyperparameters.
        model_head: ``linear`` for Hugging Face's standard token classifier or
            ``crf`` for a learned linear-chain CRF sequence head.
    """
    head = normalize_model_head(model_head)
    bert_config = build_bert_config(config)
    bert_config.model_head = head
    if head == "crf":
        bert_config.architectures = ["BertCrfForTokenClassification"]
        return BertCrfForTokenClassification(bert_config)
    bert_config.architectures = ["BertForTokenClassification"]
    return BertForTokenClassification(bert_config)


def infer_model_head(config: BertConfig) -> str:
    head = getattr(config, "model_head", None)
    if head:
        return normalize_model_head(str(head))
    architectures = getattr(config, "architectures", None) or []
    if any("Crf" in str(architecture) or "CRF" in str(architecture) for architecture in architectures):
        return "crf"
    return "linear"


def load_model(model_dir: str, model_head: Optional[str] = None) -> PreTrainedModel:
    """Load a linear or CRF token classifier from a Hugging Face checkpoint."""
    config = BertConfig.from_pretrained(model_dir)
    head = normalize_model_head(model_head) if model_head is not None else infer_model_head(config)
    if head == "crf":
        return BertCrfForTokenClassification.from_pretrained(model_dir)
    return BertForTokenClassification.from_pretrained(model_dir)


def _model_id2label_for_migration(model: PreTrainedModel) -> dict[int, str]:
    raw_id2label = getattr(model.config, "id2label", None) or {}
    normalized = {int(label_id): str(label) for label_id, label in raw_id2label.items()}
    classifier = getattr(model, "classifier", None)
    out_features = getattr(classifier, "out_features", None)
    if out_features is not None and len(normalized) != int(out_features):
        inferred = infer_legacy_id2label(int(out_features))
        if inferred is not None:
            return inferred
    return normalized


def migrate_token_classifier_labels(
    model: PreTrainedModel,
    target_label2id: dict[str, int],
    target_id2label: dict[int, str],
) -> dict[str, object]:
    """
    Expand or reorder token-classification label rows for the shared schema.

    Exact labels are copied by name. Legacy 15-label TITLE rows initialize all
    title-like rows, and legacy SEASON rows initialize PATH_SEASON.
    """
    classifier = getattr(model, "classifier", None)
    if classifier is None or not isinstance(classifier, nn.Linear):
        return {"changed": False, "reason": "no_linear_classifier"}

    target_id2label = {int(label_id): str(label) for label_id, label in target_id2label.items()}
    target_label2id = {str(label): int(label_id) for label, label_id in target_label2id.items()}
    old_id2label = _model_id2label_for_migration(model)
    old_label2id = {label: label_id for label_id, label in old_id2label.items()}
    old_num_labels = int(classifier.out_features)
    new_num_labels = len(target_label2id)

    same_schema = (
        old_num_labels == new_num_labels
        and all(old_id2label.get(idx) == target_id2label.get(idx) for idx in range(new_num_labels))
    )
    if same_schema:
        model.config.num_labels = new_num_labels
        model.config.id2label = target_id2label
        model.config.label2id = target_label2id
        return {"changed": False, "copied": new_num_labels, "target_labels": new_num_labels}

    old_weight = classifier.weight.detach()
    old_bias = classifier.bias.detach() if classifier.bias is not None else None
    new_classifier = nn.Linear(
        classifier.in_features,
        new_num_labels,
        bias=classifier.bias is not None,
        device=old_weight.device,
        dtype=old_weight.dtype,
    )
    nn.init.normal_(
        new_classifier.weight,
        mean=0.0,
        std=getattr(model.config, "initializer_range", 0.02),
    )
    if new_classifier.bias is not None:
        nn.init.zeros_(new_classifier.bias)

    row_sources: dict[int, int] = {}
    copied = 0
    for target_label, target_id in target_label2id.items():
        for source_label in label_migration_sources(target_label):
            source_id = old_label2id.get(source_label)
            if source_id is None or source_id >= old_num_labels:
                continue
            new_classifier.weight.data[target_id].copy_(old_weight[source_id])
            if new_classifier.bias is not None and old_bias is not None:
                new_classifier.bias.data[target_id].copy_(old_bias[source_id])
            row_sources[target_id] = source_id
            copied += 1
            break

    model.classifier = new_classifier
    model.num_labels = new_num_labels
    model.config.num_labels = new_num_labels
    model.config.id2label = target_id2label
    model.config.label2id = target_label2id

    if hasattr(model, "crf"):
        old_crf = model.crf
        new_crf = LinearChainCRF(new_num_labels, target_id2label).to(
            device=old_weight.device,
            dtype=old_weight.dtype,
        )
        nn.init.zeros_(new_crf.start_transitions)
        nn.init.zeros_(new_crf.end_transitions)
        nn.init.zeros_(new_crf.transitions)
        with torch.no_grad():
            for target_id, source_id in row_sources.items():
                if source_id < old_crf.start_transitions.shape[0]:
                    new_crf.start_transitions[target_id].copy_(old_crf.start_transitions[source_id])
                    new_crf.end_transitions[target_id].copy_(old_crf.end_transitions[source_id])
            for target_to_id, source_to_id in row_sources.items():
                for target_from_id, source_from_id in row_sources.items():
                    if (
                        source_from_id < old_crf.transitions.shape[0]
                        and source_to_id < old_crf.transitions.shape[1]
                    ):
                        new_crf.transitions[target_from_id, target_to_id].copy_(
                            old_crf.transitions[source_from_id, source_to_id]
                        )
        model.crf = new_crf

    return {
        "changed": True,
        "source_labels": old_num_labels,
        "target_labels": new_num_labels,
        "copied": copied,
    }


def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
    """Persist the selected head in config.json for later auto-loading."""
    head = normalize_model_head(model_head)
    model.config.model_head = head
    model.config.architectures = [
        "BertCrfForTokenClassification" if head == "crf" else "BertForTokenClassification"
    ]


def count_parameters(model) -> int:
    """Count total trainable parameters in a model."""
    return sum(p.numel() for p in model.parameters())


def print_model_summary(model):
    """Print model architecture summary with parameter count."""
    total_params = count_parameters(model)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Parameter limit: 5,000,000")
    if total_params < 5_000_000:
        print(f"[OK] Within 5M limit ({(5_000_000 - total_params):,} remaining)")
    else:
        print(f"[FAIL] Exceeds 5M limit by {total_params - 5_000_000:,}")
    return total_params


if __name__ == "__main__":
    cfg = Config()
    cfg.vocab_size = 3000
    model = create_model(cfg, model_head=os.environ.get("ANIFILEBERT_MODEL_HEAD", "linear"))
    print_model_summary(model)