import os, json, torch
from torch import nn
from transformers import AutoModel, AutoTokenizer

def mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

class SummaryEvaluatorModule(nn.Module):
    """Pretrained encoder followed by a small regression head with 3 outputs.

    The encoder is loaded with ``AutoModel.from_pretrained``. The head is a
    Linear -> ReLU -> Linear stack whose input width comes from the
    ``"in_features"`` entry of a JSON config file and whose weights are
    restored from a saved state dict.
    """

    def __init__(self, base_model_name_or_path, head_config_path, regressor_path):
        """Build the encoder and the head, then restore the head weights.

        Args:
            base_model_name_or_path: Passed to ``AutoModel.from_pretrained``.
            head_config_path: Path to a JSON file containing ``"in_features"``.
            regressor_path: Path to the saved state dict of the head.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(base_model_name_or_path)

        with open(head_config_path, "r", encoding="utf-8") as cfg_file:
            in_features = json.load(cfg_file)["in_features"]

        # Keep this exact three-layer Sequential layout: the saved state dict
        # is keyed by the layer positions inside the Sequential.
        head_layers = [
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.Linear(256, 3),
        ]
        self.regressor = nn.Sequential(*head_layers)

        # NOTE(review): torch.load unpickles the file; only point this at
        # trusted checkpoints (consider weights_only=True where available).
        head_weights = torch.load(regressor_path, map_location="cpu")
        self.regressor.load_state_dict(head_weights)
        self.regressor.eval()

    def forward(self, input_ids, attention_mask):
        """Encode the inputs, mean-pool the token states and apply the head.

        Runs entirely with gradient tracking disabled.

        Returns:
            The output of the regression head for the pooled encoder states.
        """
        with torch.no_grad():
            encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_vec = mean_pool(encoded.last_hidden_state, attention_mask)
            scores = self.regressor(sentence_vec)
        return scores

def from_pretrained_custom(repo_dir_or_id, device=None):
    """Load the evaluator model and its tokenizer from a local folder or Hub repo id.

    The location must provide, next to the encoder and tokenizer files,
    ``head_config.json`` and ``regressor.pt`` for the regression head.

    Args:
        repo_dir_or_id: Path to a local directory, or a repository id on the
            Hugging Face Hub.
        device: Device to move the model to. When ``None``, ``"cuda"`` is used
            if available, otherwise ``"cpu"``.

    Returns:
        Tuple ``(model, tokenizer, device)`` with the model in eval mode on
        ``device``.
    """
    base = repo_dir_or_id
    tok = AutoTokenizer.from_pretrained(base, use_fast=True)

    if os.path.isdir(base):
        # Local folder: the head files sit directly inside it.
        head_config_path = os.path.join(base, "head_config.json")
        regressor_path = os.path.join(base, "regressor.pt")
    else:
        # Not a local directory, so treat it as a Hub repo id. Joining a repo
        # id with a filename does not give a readable path; resolve (and
        # download/cache if needed) the head files through transformers.
        from transformers.utils import cached_file

        head_config_path = cached_file(base, "head_config.json")
        regressor_path = cached_file(base, "regressor.pt")

    mdl = SummaryEvaluatorModule(
        base_model_name_or_path=base,
        head_config_path=head_config_path,
        regressor_path=regressor_path,
    )
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    mdl.to(device).eval()
    return mdl, tok, device