"""
Unified HuggingFace-compatible quality classifier model.

Merges the mmBERT encoder with a trained MLP classifier head into a single
PreTrainedModel that can be saved and loaded with standard HuggingFace methods
and used with vLLM for efficient inference.

Example:
    # Merge trained classifier into unified model
    from src.hq.merged_model import merge_and_save
    merge_and_save(
        base_model_name="jhu-clsp/mmBERT-small",
        classifier_weights_path="./output/models/ara_Arab.pt",
        output_dir="./release/arabic-quality-classifier"
    )

    # Load and use
    model = QualityClassifierModel.from_pretrained("./release/arabic-quality-classifier")
    tokenizer = AutoTokenizer.from_pretrained("./release/arabic-quality-classifier")
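
    # Score a batch of texts with the merged model (uses score_texts defined below)
    scores = model.score_texts(["Example document text."], tokenizer)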
"""
from pathlib import Path
from typing import Optional, Union

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import SequenceClassifierOutput

from .config import EMBEDDING_CONFIG, TRAINING_CONFIG


class QualityClassifierConfig(PretrainedConfig):
    """Configuration for the unified quality classifier model."""

    model_type = "quality_classifier"

    def __init__(
        self,
        base_model_name: Optional[str] = None,
        hidden_dim: Optional[int] = None,
        dropout: Optional[float] = None,
        num_labels: int = 1,
        **kwargs
    ):
        """
        Initialize configuration.

        Args:
            base_model_name: HuggingFace model ID for the encoder
            hidden_dim: Hidden dimension of the MLP classifier
            dropout: Dropout probability
            num_labels: Number of output labels (1 for binary)
        """
        super().__init__(**kwargs)
        self.base_model_name = base_model_name or EMBEDDING_CONFIG["model_name"]
        # Use `is not None` checks so that explicit 0 / 0.0 values are not
        # silently replaced by the config defaults.
        self.hidden_dim = hidden_dim if hidden_dim is not None else TRAINING_CONFIG["hidden_dim"]
        self.dropout = dropout if dropout is not None else TRAINING_CONFIG["dropout"]
        self.num_labels = num_labels


class QualityClassifierModel(PreTrainedModel):
    """
    Unified quality classifier combining mmBERT encoder with MLP head.

    This model can be saved and loaded using standard HuggingFace methods:
        model.save_pretrained("path/to/model")
        model = QualityClassifierModel.from_pretrained("path/to/model")

    It can also be used with vLLM for efficient inference since mmBERT
    is supported.

    Architecture:
        - Encoder: mmBERT (small or base)
        - Pooling: Mean pooling over sequence
        - Classifier: Linear(hidden_size -> hidden_dim) -> ReLU -> Dropout(dropout)
          -> Linear(hidden_dim -> num_labels) -> Sigmoid
    """

    config_class = QualityClassifierConfig

    def __init__(self, config: QualityClassifierConfig):
        """
        Initialize the unified model.

        Args:
            config: QualityClassifierConfig instance
        """
        super().__init__(config)

        # Load base encoder with eager attention to avoid flash_attn issues
        self.encoder = AutoModel.from_pretrained(
            config.base_model_name,
            attn_implementation="eager",
        )
        hidden_size = self.encoder.config.hidden_size

        # Classification head (matches standalone training architecture)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, config.hidden_dim),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.hidden_dim, config.num_labels),
            nn.Sigmoid()
        )

        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> SequenceClassifierOutput:
        """
        Forward pass with optional loss computation.

        Args:
            input_ids: Token IDs of shape (batch_size, seq_length)
            attention_mask: Attention mask of shape (batch_size, seq_length)
            token_type_ids: Token type IDs (unused for mmBERT)
            labels: Ground truth labels for loss computation
            return_dict: Whether to return a SequenceClassifierOutput

        Returns:
            SequenceClassifierOutput with loss, logits, and hidden states
        """
        # Encode
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Mean pooling
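        # Average token embeddings, using the attention mask to exclude padding;
        # the clamp below guards against division by zero for all-padding masks.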
        token_embeddings = outputs.last_hidden_state
        if attention_mask is not None:
            mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
            sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
            pooled = sum_embeddings / sum_mask
        else:
            pooled = token_embeddings.mean(dim=1)

        # Classify
        logits = self.classifier(pooled)

        # Compute loss if labels provided
        loss = None
        if labels is not None:
            loss_fn = nn.BCELoss()
            # view(-1) keeps a 1-D shape even for batch size 1, where squeeze()
            # would collapse the logits to a scalar and fail BCELoss's shape check.
            loss = loss_fn(logits.view(-1), labels.float().view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def predict(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Convenience method for inference.

        Args:
            input_ids: Token IDs
            attention_mask: Attention mask

        Returns:
            Quality scores in range [0, 1]
        """
        self.eval()
        with torch.no_grad():
            outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask)
            # squeeze(-1) drops only the label dimension, so a single input still
            # yields a tensor of shape (1,) rather than a 0-d scalar.
            return outputs.logits.squeeze(-1)

    def score_texts(
        self,
        texts: list[str],
        tokenizer: AutoTokenizer,
        batch_size: int = 32,
        max_length: int = 512,
        device: Optional[str] = None,
    ) -> list[float]:
        """
        Score a list of texts.

        Args:
            texts: List of text strings to score
            tokenizer: Tokenizer for the model
            batch_size: Batch size for processing
            max_length: Maximum sequence length
            device: Device to use for inference

        Returns:
            List of quality scores in range [0, 1]
        """
        device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        self.eval()

        scores = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                max_length=max_length,
                truncation=True,
                padding=True,
            ).to(device)

            with torch.no_grad():
                outputs = self.forward(**inputs)
                batch_scores = outputs.logits.squeeze().cpu().tolist()

            # Handle single item case
            if isinstance(batch_scores, float):
                batch_scores = [batch_scores]

            scores.extend(batch_scores)

        return scores


def merge_and_save(
    base_model_name: str,
    classifier_weights_path: Union[str, Path],
    output_dir: Union[str, Path],
    hidden_dim: Optional[int] = None,
    dropout: Optional[float] = None,
) -> QualityClassifierModel:
    """
    Merge encoder and trained classifier head, then save as unified model.

    The resulting model can be loaded with:
        model = QualityClassifierModel.from_pretrained(output_dir)

    Args:
        base_model_name: HuggingFace model ID for the encoder
        classifier_weights_path: Path to trained MLP weights (.pt file)
        output_dir: Directory to save the merged model
        hidden_dim: Hidden dimension of the MLP (must match training)
        dropout: Dropout rate (must match training)

    Returns:
        The merged QualityClassifierModel
    """
    hidden_dim = hidden_dim if hidden_dim is not None else TRAINING_CONFIG["hidden_dim"]
    dropout = dropout if dropout is not None else TRAINING_CONFIG["dropout"]
    output_dir = Path(output_dir)

    print(f"Merging model...")
    print(f"  Encoder: {base_model_name}")
    print(f"  Classifier: {classifier_weights_path}")

    # Create config
    config = QualityClassifierConfig(
        base_model_name=base_model_name,
        hidden_dim=hidden_dim,
        dropout=dropout,
        num_labels=1
    )

    # Initialize model (loads encoder from HuggingFace)
    model = QualityClassifierModel(config)

    # Load trained classifier weights
    checkpoint = torch.load(classifier_weights_path, map_location="cpu")

    # Handle both new format (dict with state_dict) and old format (just state_dict)
    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
        trained_weights = checkpoint["state_dict"]
    else:
        trained_weights = checkpoint

    # Map weights from standalone MLP to integrated classifier
    # The standalone model saves with "classifier." prefix, strip it
    stripped_weights = {}
    for key, value in trained_weights.items():
        new_key = key.replace("classifier.", "") if key.startswith("classifier.") else key
        stripped_weights[new_key] = value

    model.classifier.load_state_dict(stripped_weights)

    # Save everything
    output_dir.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(output_dir)

    # Also save tokenizer for convenience
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.save_pretrained(output_dir)

    print(f"Model saved to {output_dir}")
    print(f"Contents: {list(output_dir.iterdir())}")

    return model


def merge_all_classifiers(
    models_dir: Union[str, Path],
    output_base_dir: Union[str, Path],
    base_model_name: Optional[str] = None,
) -> dict:
    """
    Merge all trained classifiers into unified models.

    Args:
        models_dir: Directory containing trained .pt files
        output_base_dir: Base directory for output models
        base_model_name: HuggingFace model ID for the encoder

    Returns:
        Dictionary mapping language codes to output directories
    """
    base_model_name = base_model_name or EMBEDDING_CONFIG["model_name"]
    models_dir = Path(models_dir)
    output_base_dir = Path(output_base_dir)

    results = {}

    # sorted() gives a deterministic processing order across filesystems
    for pt_file in sorted(models_dir.glob("*.pt")):
        lang_code = pt_file.stem  # e.g., "ara_Arab"
        output_dir = output_base_dir / f"{lang_code}-quality-classifier"

        print(f"\n{'=' * 50}")
        print(f"Processing: {lang_code}")
        print(f"{'=' * 50}")

        merge_and_save(
            base_model_name=base_model_name,
            classifier_weights_path=pt_file,
            output_dir=output_dir,
        )

        results[lang_code] = str(output_dir)

    return results


# Register the custom classes for auto-loading.
# This allows AutoModel.from_pretrained(path, trust_remote_code=True) to work.
QualityClassifierConfig.register_for_auto_class()
QualityClassifierModel.register_for_auto_class("AutoModel")
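

# Optional usage sketch (not executed by the library): merge a trained head into
# a unified model, reload it, and score a sample text. The paths below are the
# placeholder paths from the module docstring; substitute real artifacts.
if __name__ == "__main__":
    merged_dir = "./release/arabic-quality-classifier"
    merge_and_save(
        base_model_name=EMBEDDING_CONFIG["model_name"],
        classifier_weights_path="./output/models/ara_Arab.pt",
        output_dir=merged_dir,
    )

    # Reload the merged model and tokenizer, then score a sample text.
    model = QualityClassifierModel.from_pretrained(merged_dir)
    tokenizer = AutoTokenizer.from_pretrained(merged_dir)
    print(model.score_texts(["Example document text."], tokenizer))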