"""
bert_ordinal.py
---------------
BERT-based ordinal regression model, fully integrated with the HuggingFace
Transformers API:

    model.save_pretrained("my-checkpoint/")
    model = BertOrdinal.from_pretrained("my-checkpoint/")

Architecture
------------
1. A (optionally frozen) BERT backbone.
2. A projection head on the [CLS] token:
       Linear(hidden_size → hidden_dim) → ReLU → Dropout(p) → Linear(hidden_dim → 1)
   producing a single latent score s ∈ ℝ.
3. K-1 learnable raw_threshold parameters enforcing monotonicity via
   cumsum(softplus(·)).
4. Cumulative-link probabilities:
       P(Y ≤ j | x) = σ(θ_j − s)

Usage
-----
    from bert_ordinal import BertOrdinalConfig, BertOrdinal

    # ── Create from scratch ──────────────────────────────────────────────────
    cfg = BertOrdinalConfig(
        bert_model_name="bert-base-uncased",
        num_classes=3,
        hidden_dim=128,
        dropout=0.1,
        freeze_bert=True,
    )
    model = BertOrdinal(cfg)

    # ── Save ────────────────────────────────────────────────────────────────
    model.save_pretrained("my-checkpoint/")
    tokenizer.save_pretrained("my-checkpoint/")   # keep tokenizer alongside

    # ── Reload ──────────────────────────────────────────────────────────────
    model     = BertOrdinal.from_pretrained("my-checkpoint/")
    tokenizer = AutoTokenizer.from_pretrained("my-checkpoint/")
"""

from __future__ import annotations
from typing import Optional
from transformers import PretrainedConfig


# ---------------------------------------------------------------------------
# 1. Config  —  subclass PretrainedConfig for full HF serialisation
# ---------------------------------------------------------------------------

class BertOrdinalConfig(PretrainedConfig):
    """
    Configuration for :class:`BertOrdinal`.

    Because this inherits from :class:`~transformers.PretrainedConfig`,
    ``save_pretrained`` writes a ``config.json`` that ``from_pretrained``
    can read back without any extra bookkeeping.

    Parameters
    ----------
    bert_model_name : str
        HuggingFace model name or local path for the BERT backbone.
    num_classes : int
        Number of ordinal classes K.  Creates K-1 learnable thresholds.
    hidden_dim : int
        Inner dimension of the projection head.
    dropout : float
        Dropout probability inside the projection head.
    freeze_bert : bool
        Freeze backbone weights at construction time.
    loss_reduction : str
        ``'mean'`` or ``'sum'``.
    """

    # Tells HF which class owns this config (written into config.json).
    model_type = "bert_ordinal"
    problem_type = "single_label_classification"

    def __init__(
        self,
        bert_model_name: str  = "allenai/scibert_scivocab_uncased",
        num_classes:     int  = 3,
        hidden_dim:      int  = 256,
        dropout:         float = 0.1,
        freeze_bert:     bool = True,
        loss_reduction:  str  = "mean",
        # hidden_size is set automatically by the model after loading BERT;
        # it is stored here so from_pretrained can rebuild the head offline.
        hidden_size:     Optional[int] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.bert_model_name = bert_model_name
        self.num_classes     = num_classes
        self.hidden_dim      = hidden_dim
        self.dropout         = dropout
        self.freeze_bert     = freeze_bert
        self.loss_reduction  = loss_reduction
        self.hidden_size     = hidden_size   # filled in by BertOrdinal.__init__

        self.auto_map = {
            "AutoConfig": "configuration_bert_ordinal.BertOrdinalConfig",
            "AutoModel": "modeling_bert_ordinal.BertOrdinal",
            "AutoModelForSequenceClassification": "modeling_bert_ordinal.BertOrdinal",
        }