from model import PredicateAwareSRL
from transformers import AutoTokenizer
import spacy
from spacy import cli as spacy_cli 
import torch

@torch.no_grad()
def predict_srl_single(
    model, tokenizer, words, predicate_word_idx, id2label, device="cuda"
):
    """Tag one pre-tokenized sentence with SRL labels for a single predicate.

    The encoder input is assembled by hand as
    ``[CLS] sentence-subwords [SEP] predicate-subwords [SEP]`` with segment id
    0 for the sentence part and 1 for the predicate part. Each word is
    represented by the position of its first subword in that full sequence.

    Args:
        model: Callable invoked with the keyword tensors built below; it must
            return a pair whose first item is the logits tensor.
        tokenizer: Hugging Face *fast* tokenizer (``word_ids()`` is required)
            exposing ``cls_token_id`` and ``sep_token_id``.
        words: Pre-split tokens, one per element, e.g.
            ``[t.text for t in nlp(sentence)]``.
        predicate_word_idx: Index of the predicate in ``words``. An in-range
            negative index is normalised to its non-negative equivalent.
        id2label: Mapping from integer label id to tag string.
        device: Device name or ``torch.device``. CUDA is used only when it is
            both requested and available; anything else runs on CPU.

    Returns:
        ``(tags, logits)``: one tag string per word, and the logits moved to
        CPU with the batch dimension squeezed out (``[L_word, num_labels]``).

    Raises:
        IndexError: ``predicate_word_idx`` is outside ``words``.
        ValueError: the tokenizer is not a fast tokenizer, or some word
            produced no subword tokens (e.g. a whitespace-only token).
    """
    model.eval()

    n_words = len(words)
    if not -n_words <= predicate_word_idx < n_words:
        raise IndexError(
            f"predicate_word_idx {predicate_word_idx} is out of range for "
            f"a sentence of {n_words} words."
        )
    if predicate_word_idx < 0:
        # Hand the model a plain non-negative position.
        predicate_word_idx += n_words

    # --- sentence subwords ---
    sent_enc = tokenizer(
        words,
        is_split_into_words=True,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )

    # Require a *fast* tokenizer to get word_ids
    try:
        sent_word_ids = sent_enc.word_ids()
    except Exception as err:
        raise ValueError(
            "Tokenizer must be a *fast* tokenizer to use .word_ids(). "
            "Initialize with use_fast=True."
        ) from err

    sent_wp_ids = sent_enc["input_ids"]
    # HF may return [[...]] vs [...] depending on version; normalize to a flat list
    if sent_wp_ids and isinstance(sent_wp_ids[0], list):
        sent_wp_ids = sent_wp_ids[0]

    # First-subword position per word, shifted by one for the [CLS] added below.
    first_pos_by_wid = {}
    for pos, wid in enumerate(sent_word_ids):
        if wid is not None:
            first_pos_by_wid.setdefault(wid, pos + 1)

    # A word the tokenizer dropped entirely has no position to read from.
    missing = [i for i in range(n_words) if i not in first_pos_by_wid]
    if missing:
        dropped = [words[i] for i in missing]
        raise ValueError(
            f"Words at indices {missing} produced no subword tokens "
            f"({dropped!r}); remove whitespace-only or empty tokens first."
        )

    # --- predicate subwords (surface form only) ---
    pred_enc = tokenizer(
        [words[predicate_word_idx]],
        is_split_into_words=True,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    pred_wp_ids = pred_enc["input_ids"]
    if pred_wp_ids and isinstance(pred_wp_ids[0], list):
        pred_wp_ids = pred_wp_ids[0]

    # --- assemble full input: [CLS] sent [SEP] pred [SEP] ---
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    input_ids = [cls_id] + sent_wp_ids + [sep_id] + pred_wp_ids + [sep_id]
    token_type_ids = [0] * (1 + len(sent_wp_ids) + 1) + [1] * (len(pred_wp_ids) + 1)
    attention_mask = [1] * len(input_ids)

    # --- tensors ---
    # str() lets callers pass either "cuda:0" or torch.device("cuda:0").
    use_cuda = torch.cuda.is_available() and "cuda" in str(device)
    device = torch.device(device if use_cuda else "cpu")

    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(token_type_ids).unsqueeze(0).to(device)
    attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)

    word_first_wp_fullidx = torch.tensor(
        [first_pos_by_wid[i] for i in range(n_words)], dtype=torch.long
    ).unsqueeze(0).to(device)
    sent_len = torch.tensor([n_words], dtype=torch.long).to(device)
    sentence_mask = torch.ones(1, n_words, dtype=torch.bool).to(device)
    pred_word_idx = torch.tensor([predicate_word_idx], dtype=torch.long).to(device)
    # One-hot marker of the predicate position over the words.
    indicator = torch.zeros(1, n_words, dtype=torch.long).to(device)
    indicator[0, predicate_word_idx] = 1

    # --- forward ---
    logits, _ = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        word_first_wp_fullidx=word_first_wp_fullidx,
        sentence_mask=sentence_mask,
        sent_lens=sent_len,
        pred_word_idx=pred_word_idx,
        indicator=indicator,
        labels=None,
    )

    pred_ids = logits.argmax(-1).squeeze(0).tolist()
    tags = [id2label[i] for i in pred_ids]
    return tags, logits.squeeze(0).cpu()  # [L_word, num_labels]


def spacy_verb_indices(nlp, sentence: str):
    """
    Return the 0-based positions of verb-like tokens in ``sentence``.

    A token counts as verb-like when its coarse POS is ``VERB`` or ``AUX``,
    or when its fine-grained tag starts with ``VB``.
    """
    verb_like_pos = ("VERB", "AUX")
    hits = []
    for position, token in enumerate(nlp(sentence)):
        if token.pos_ in verb_like_pos:
            hits.append(position)
        elif token.tag_.startswith("VB"):
            hits.append(position)
    return hits


def words_and_spans_spacy(sentence: str, nlp):
    """
    Tokenize ``sentence`` with ``nlp`` and pair each token with its offsets.

    Returns:
      words : list[str]            token texts, in order
      spans : list[(start,end)]    character offsets per token, where start is
                                   the token's ``idx`` and end is start plus
                                   the length of its text
    """
    words, spans = [], []
    for token in nlp(sentence):
        surface = token.text
        start = token.idx
        words.append(surface)
        spans.append((start, start + len(surface)))
    return words, spans

def bio_to_spans(tags):
    """
    Convert a BIO tag sequence into ``(role, start, end)`` argument spans.

    ``start`` and ``end`` are inclusive word indices. A span opens at a
    ``B-<role>`` tag and extends over the directly following ``I-<role>`` tags
    of the same role. ``O`` tags, any tag ending in ``-V`` (the predicate
    itself) and ``I-`` tags with no matching open span are ignored.
    """
    spans = []
    open_role = None
    open_start = 0
    for idx, tag in enumerate(tags):
        if open_role is not None:
            if tag == f"I-{open_role}":
                # Still inside the current span.
                continue
            spans.append((open_role, open_start, idx - 1))
            open_role = None
        if tag.startswith("B-") and not tag.endswith("-V"):
            open_role, open_start = tag[2:], idx
    if open_role is not None:
        # The sequence ended while a span was still open.
        spans.append((open_role, open_start, len(tags) - 1))
    return spans


@torch.no_grad()
def predict_srl_allennlp_like_spacy(
    model, tokenizer, nlp, sentence, id2label,
    device="cuda",
    prob_threshold=0.50,
    top_k=None,
    pick_best_if_none=True
):
    """Run SRL once per spaCy-detected verb and return one frame per predicate.

    Args:
        model, tokenizer, id2label, device: Forwarded to ``predict_srl_single``.
        nlp: spaCy pipeline; it is called exactly once on ``sentence``.
        sentence: Raw text to analyse.
        prob_threshold: Frames whose predicate-tag probability is below this
            value are dropped. ``None`` disables filtering.
        top_k: If given, only the first ``top_k`` verb candidates in sentence
            order are scored. This is a positional cut, not a ranking by score.
        pick_best_if_none: When filtering would drop every frame, keep the
            single highest-scoring one instead.

    Returns:
        ``(words, frames)``. ``words`` are the spaCy token texts. Each frame is
        a dict with keys ``predicate_index``, ``predicate``, ``p_bv`` (softmax
        probability of the predicate tag at the predicate position), ``tags``
        and ``spans`` (from ``bio_to_spans``). An empty sentence gives
        ``([], [])``; a sentence without verb candidates gives ``(words, [])``.

    Raises:
        ValueError: ``id2label`` contains neither ``'B-V'`` nor ``'V'``.
    """
    model.eval()

    # Parse once: both the word list and the verb candidates come from this Doc.
    doc = nlp(sentence)
    words = [t.text for t in doc]
    if not words:
        return [], []

    # Verb candidates, using the same rule as spacy_verb_indices.
    verb_idxs = [
        i for i, t in enumerate(doc)
        if t.pos_ in ("VERB", "AUX") or t.tag_.startswith("VB")
    ]
    if not verb_idxs:
        return words, []   # no predicates found

    # find predicate label id
    pred_ids = [i for i, t in id2label.items() if t in ("B-V", "V")]
    if not pred_ids:
        raise ValueError("Label set has no predicate tag ('B-V' or 'V').")
    b_v_id = pred_ids[0]

    keep = verb_idxs if top_k is None else verb_idxs[:top_k]

    results = []
    for p in keep:
        tags, logits = predict_srl_single(
            model, tokenizer, words, p, id2label, device=device
        )
        # Confidence that position p really is a predicate.
        p_bv = torch.softmax(logits[p], dim=-1)[b_v_id].item()
        results.append({
            "predicate_index": p,
            "predicate": words[p],
            "p_bv": p_bv,
            "tags": tags,
            "spans": bio_to_spans(tags),
        })

    # optional thresholding
    if prob_threshold is not None:
        passed = [r for r in results if r["p_bv"] >= prob_threshold]
        if not passed and pick_best_if_none and results:
            passed = [max(results, key=lambda r: r["p_bv"])]
        results = passed

    return words, results

def normalize_whitespace(s: str) -> str:
    """Collapse each whitespace run in *s* to one ASCII space and trim the ends.

    Every Unicode whitespace character (including the non-breaking space
    U+00A0 and the thin space U+2009, as well as tabs and newlines) is treated
    as a separator. ``None`` is mapped to the empty string.

    Collapsing interior runs matters for the caller in this file: the
    normalized sentence is handed to spaCy, and leftover whitespace runs would
    otherwise surface as separate whitespace-only tokens.
    """
    if s is None:
        return ""
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so join() yields single-space separation.
    return " ".join(s.split())

def main_predictor(
    model_path,
    bert_name,
    sentence,
    spacy_model="en_core_web_md",
    prob_threshold=0.40,
):
    """Load a checkpoint, tokenizer and spaCy pipeline, then run SRL on a sentence.

    Args:
        model_path: Path to a checkpoint readable by ``torch.load``. It must
            hold ``label2id`` or ``id2label``; hyper-parameters are read from
            ``hparams`` or ``hyper_parameters`` and weights from ``state_dict``
            or ``model_state_dict`` (falling back to the checkpoint itself).
        bert_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        sentence: Raw input text; whitespace is normalized before parsing.
        spacy_model: spaCy pipeline name; downloaded if it is not installed.
        prob_threshold: Minimum predicate probability for a frame to be kept.

    Returns:
        ``(words, frames)`` as produced by ``predict_srl_allennlp_like_spacy``.

    Raises:
        KeyError: the checkpoint has neither ``label2id`` nor ``id2label``.
    """
    sentence = normalize_whitespace(sentence)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # SECURITY: torch.load may unpickle arbitrary objects (depending on the
    # torch version's weights_only default); only load trusted checkpoints.
    ckpt = torch.load(model_path, map_location=device)
    hp = ckpt.get("hparams", ckpt.get("hyper_parameters", {}))

    model = PredicateAwareSRL(**hp).to(device)
    state = ckpt.get("state_dict", ckpt.get("model_state_dict", ckpt))
    model.load_state_dict(state)
    model.eval()

    if "label2id" in ckpt:
        label2id = ckpt["label2id"]
    elif "id2label" in ckpt:
        label2id = {label: idx for idx, label in ckpt["id2label"].items()}
    else:
        raise KeyError("Checkpoint contains neither 'label2id' nor 'id2label'.")
    # Predictions are looked up by integer argmax ids, so force int keys even
    # if the checkpoint stored the ids as strings.
    id2label = {int(idx): label for label, idx in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(bert_name, use_fast=True)

    try:
        nlp = spacy.load(spacy_model)
    except OSError:
        # Pipeline not installed locally: fetch it once, then load again.
        spacy_cli.download(spacy_model)
        nlp = spacy.load(spacy_model)

    words, frames = predict_srl_allennlp_like_spacy(
        model, tokenizer, nlp, sentence, id2label,
        device=device, prob_threshold=prob_threshold, top_k=None,
        pick_best_if_none=True
    )
    return words, frames