"""AspectBERT model: DistilBERT backbone + custom classification head.

Architecture:
  - distilbert-base-uncased backbone (6 transformer layers)
  - First 4 transformer layers (and embeddings) frozen, last 2 fine-tuned
  - Classification head: Linear(768->256) -> GELU -> Dropout(0.2) -> Linear(256->3)

The [CLS] token's last hidden state is fed to the classification head to
produce 3-way (negative/neutral/positive) sentiment logits per
"{review_text} aspect: {aspect_name}" input.
"""

import os
import sys

import torch
import torch.nn as nn
from transformers import DistilBertModel

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from constants import MAX_LENGTH, MODEL_NAME, NUM_LABELS  # noqa: E402


class AspectBERT(nn.Module):
    """DistilBERT encoder with a small MLP head for aspect sentiment logits.

    The encoder's last hidden state at sequence position 0 (the [CLS] token)
    is passed through ``Linear -> GELU -> Dropout -> Linear`` to produce one
    logit per label.

    Args:
        model_name: Identifier or path handed to
            ``DistilBertModel.from_pretrained``.
        num_labels: Number of output logits (width of the final linear layer).
        freeze_layers: How many leading transformer layers to freeze. The
            embeddings are frozen regardless of this value. A value <= 0
            freezes no transformer layer; a value >= the number of layers
            freezes all of them.
        head_hidden_size: Width of the hidden layer in the classification
            head (keyword-only).
        dropout: Dropout probability used inside the classification head
            (keyword-only).
    """

    def __init__(self, model_name=MODEL_NAME, num_labels=NUM_LABELS, freeze_layers=4,
                 *, head_hidden_size=256, dropout=0.2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding it
        # (768 for distilbert-base).
        encoder_dim = self.distilbert.config.dim

        # Module order is fixed so state-dict keys (classifier.0, classifier.3)
        # stay stable whatever sizes are chosen.
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, head_hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_size, num_labels),
        )

        self._freeze_layers(freeze_layers)

    def _freeze_layers(self, n_frozen):
        """Freeze embeddings and the first `n_frozen` transformer layers.

        Freezing only turns off ``requires_grad``; it does not change the
        train/eval mode of the affected modules.
        """
        self.distilbert.embeddings.requires_grad_(False)

        for index, layer in enumerate(self.distilbert.transformer.layer):
            if index >= n_frozen:
                # Layers are visited in order, so every later one stays trainable.
                break
            layer.requires_grad_(False)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            Tensor of logits shaped [batch, num_labels].
        """
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence is used as the pooled representation.
        pooled = encoded.last_hidden_state[:, 0, :]  # [batch, hidden_size]
        return self.classifier(pooled)  # [batch, num_labels]

    def trainable_parameter_summary(self):
        """Count parameters and report how many have ``requires_grad`` set.

        Returns:
            Dict with ``total_params``, ``trainable_params`` and
            ``trainable_pct`` (trainable share of the total, in percent).
        """
        total = 0
        trainable = 0
        for param in self.parameters():
            count = param.numel()
            total += count
            if param.requires_grad:
                trainable += count
        return {
            "total_params": total,
            "trainable_params": trainable,
            "trainable_pct": 100.0 * trainable / total,
        }


if __name__ == "__main__":
    # Smoke test: build the model, report which parameters are trainable, and
    # push one tokenized example through a forward pass.
    from transformers import DistilBertTokenizerFast

    print("Building AspectBERT model...")
    net = AspectBERT()
    net.eval()  # switch to eval mode so dropout is inactive for the check

    summary = net.trainable_parameter_summary()
    print(f"Total params:     {summary['total_params']:,}")
    print(f"Trainable params: {summary['trainable_params']:,} "
          f"({summary['trainable_pct']:.2f}%)")

    # A layer counts as trainable as soon as one of its parameters does.
    print("\nFrozen vs trainable transformer layers:")
    for i, block in enumerate(net.distilbert.transformer.layer):
        any_trainable = any(weight.requires_grad for weight in block.parameters())
        print(f"  layer {i}: {'trainable' if any_trainable else 'frozen'}")

    print("\nRunning a forward pass with dummy input...")
    text = "The battery life is amazing and lasts all day. aspect: battery"
    tok = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
    encoded = tok(
        text,
        return_tensors="pt",
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # No gradients are needed for a sanity-check forward pass.
    with torch.no_grad():
        logits = net(input_ids, attention_mask)

    probs = logits.softmax(dim=-1)
    print(f"Input: {text!r}")
    print(f"Logits shape: {tuple(logits.shape)}")
    print(f"Logits: {logits.tolist()}")
    print(f"Probabilities (negative/neutral/positive): {probs.tolist()}")
    print("\nForward pass OK.")