yeomtong
/

srl_bert_model

+from model import PredicateAwareSRL
+from transformers import AutoTokenizer
+import spacy
+from spacy import cli as spacy_cli
+import torch
@torch.no_grad()
def predict_srl_single(
    model, tokenizer, words, predicate_word_idx, id2label, device="cuda"
):
    """Tag one pre-tokenized sentence with SRL labels for a single predicate.

    The input is assembled as ``[CLS] sentence [SEP] predicate [SEP]`` and the
    model is called once with a batch of size one.

    Args:
        model: SRL model. It is switched to eval mode and called with the
            keyword arguments built below; it must return ``(logits, _)``.
            The model itself is NOT moved to ``device`` here.
        tokenizer: A *fast* Hugging Face tokenizer (``.word_ids()`` is used).
        words: Sentence as a list of word strings, one element per token
            (e.g. ``[t.text for t in nlp(sentence)]``).
        predicate_word_idx: Index into ``words`` of the predicate. Negative
            indices are normalized to their non-negative equivalent.
        id2label: Mapping from integer label id to tag string.
        device: Requested device. Any value whose string form does not contain
            ``"cuda"``, or a CUDA request while CUDA is unavailable, resolves
            to CPU.

    Returns:
        ``(tags, logits)`` where ``tags`` is the list of argmax tag strings and
        ``logits`` is the model output with the batch dimension squeezed, on
        CPU (presumably ``[n_words, num_labels]`` -- depends on the model).

    Raises:
        ValueError: if ``words`` is empty, if the tokenizer is not a fast
            tokenizer, or if some word produces no sub-word piece.
        IndexError: if ``predicate_word_idx`` is out of range.
    """
    model.eval()

    n_words = len(words)
    if n_words == 0:
        raise ValueError("`words` must contain at least one token.")
    if not -n_words <= predicate_word_idx < n_words:
        raise IndexError(
            f"predicate_word_idx {predicate_word_idx} is out of range for "
            f"{n_words} words."
        )
    # Keep the index handed to the model consistent with the indicator vector.
    predicate_word_idx = predicate_word_idx % n_words

    # --- sentence subwords ---
    sent_enc = tokenizer(
        words,
        is_split_into_words=True,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    # Require a *fast* tokenizer to get word_ids
    try:
        sent_word_ids = sent_enc.word_ids()
    except Exception as err:
        raise ValueError(
            "Tokenizer must be a *fast* tokenizer to use .word_ids(). "
            "Initialize with use_fast=True."
        ) from err
    sent_wp_ids = sent_enc["input_ids"]
    # HF may return [[...]] vs [...] depending on version -- normalize to a flat list
    if sent_wp_ids and isinstance(sent_wp_ids[0], list):
        sent_wp_ids = sent_wp_ids[0]

    # first-subword index per word (in the full sequence after we add [CLS])
    first_pos_by_wid = {}
    for pos, wid in enumerate(sent_word_ids):
        if wid is not None and wid not in first_pos_by_wid:
            first_pos_by_wid[wid] = pos + 1  # +1 to account for [CLS] we add below

    # A word that tokenizes to nothing has no position to read a label from;
    # fail with an actionable message instead of a bare KeyError.
    missing = [i for i in range(n_words) if i not in first_pos_by_wid]
    if missing:
        raise ValueError(
            f"Words at indices {missing} produced no subword tokens "
            "(e.g. whitespace-only tokens); remove them before calling."
        )

    # --- predicate subwords (surface form only) ---
    pred_enc = tokenizer(
        [words[predicate_word_idx]],
        is_split_into_words=True,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    pred_wp_ids = pred_enc["input_ids"]
    if pred_wp_ids and isinstance(pred_wp_ids[0], list):
        pred_wp_ids = pred_wp_ids[0]

    # --- assemble full input: [CLS] sent [SEP] pred [SEP] ---
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    input_ids = [cls_id] + sent_wp_ids + [sep_id] + pred_wp_ids + [sep_id]
    token_type_ids = [0] * (1 + len(sent_wp_ids) + 1) + [1] * (len(pred_wp_ids) + 1)
    attention_mask = [1] * len(input_ids)

    # --- tensors ---
    # str() lets callers pass either a device string or a torch.device object.
    use_cuda = "cuda" in str(device) and torch.cuda.is_available()
    device = torch.device(device if use_cuda else "cpu")

    input_ids = torch.tensor([input_ids], device=device)
    token_type_ids = torch.tensor([token_type_ids], device=device)
    attention_mask = torch.tensor([attention_mask], device=device)
    sent_len = torch.tensor([n_words], dtype=torch.long, device=device)
    sentence_mask = torch.ones(1, n_words, dtype=torch.bool, device=device)
    pred_word_idx = torch.tensor([predicate_word_idx], dtype=torch.long, device=device)
    indicator = torch.zeros(1, n_words, dtype=torch.long, device=device)
    indicator[0, predicate_word_idx] = 1
    word_first_wp_fullidx = torch.tensor(
        [[first_pos_by_wid[i] for i in range(n_words)]],
        dtype=torch.long,
        device=device,
    )

    # --- forward ---
    logits, _ = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        word_first_wp_fullidx=word_first_wp_fullidx,
        sentence_mask=sentence_mask,
        sent_lens=sent_len,
        pred_word_idx=pred_word_idx,
        indicator=indicator,
        labels=None,
    )
    pred_ids = logits.argmax(-1).squeeze(0).tolist()
    tags = [id2label[i] for i in pred_ids]
    return tags, logits.squeeze(0).cpu()
def spacy_verb_indices(nlp, sentence: str):
    """
    Parse `sentence` with `nlp` and list the positions of verb-like tokens.

    A token qualifies when its coarse POS (`pos_`) is VERB or AUX, or when its
    fine-grained tag (`tag_`) begins with "VB". Positions are 0-based indices
    into the parsed token sequence.
    """
    verb_positions = []
    for position, token in enumerate(nlp(sentence)):
        coarse_is_verbal = token.pos_ in ("VERB", "AUX")
        if coarse_is_verbal or token.tag_.startswith("VB"):
            verb_positions.append(position)
    return verb_positions
def words_and_spans_spacy(sentence: str, nlp):
    """
    Tokenize `sentence` with `nlp` and report each token's character range.

    Returns:
      words : list[str]            token texts, in order
      spans : list[(start,end)]    `token.idx` and `token.idx + len(token.text)`
    """
    words = []
    spans = []
    for token in nlp(sentence):
        surface = token.text
        begin = token.idx
        words.append(surface)
        spans.append((begin, begin + len(surface)))
    return words, spans
def bio_to_spans(tags):
    """Convert a BIO tag sequence into ``(role, start, end)`` argument spans.

    ``start`` and ``end`` are inclusive token indices. Tags ending in ``-V``
    (the predicate) and tags without a ``B-``/``I-`` prefix (including ``O``)
    produce no span.

    An ``I-<role>`` tag that does not continue a span of the same role opens a
    new span instead of being discarded. Per-token argmax decoding can emit
    such ill-formed sequences, and the bracket rendering in this project
    already treats them as span starts.

    Args:
        tags: Sequence of tag strings such as ``"B-ARG0"``, ``"I-ARG0"``, ``"O"``.

    Returns:
        List of ``(role, start, end)`` tuples in left-to-right order.
    """
    spans = []
    total = len(tags)
    i = 0
    while i < total:
        tag = tags[i]
        if tag.endswith("-V") or not tag.startswith(("B-", "I-")):
            i += 1
            continue
        role = tag[2:]
        continuation = f"I-{role}"
        j = i + 1
        while j < total and tags[j] == continuation:
            j += 1
        spans.append((role, i, j - 1))
        i = j
    return spans
@torch.no_grad()
def predict_srl_allennlp_like_spacy(
    model, tokenizer, nlp, sentence, id2label,
    device="cuda",
    prob_threshold=0.50,
    top_k=None,
    pick_best_if_none=True
):
    """Predict one SRL frame per verb-like token of ``sentence``.

    The sentence is parsed once with ``nlp``. Whitespace-only tokens are
    dropped because they yield no sub-word pieces and would break the
    word/sub-word alignment in ``predict_srl_single``. Every remaining token
    whose ``pos_`` is VERB/AUX or whose ``tag_`` starts with "VB" is used as a
    predicate candidate (same rule as ``spacy_verb_indices``).

    Args:
        model: SRL model; switched to eval mode and forwarded to
            ``predict_srl_single``.
        tokenizer: Tokenizer forwarded to ``predict_srl_single``.
        nlp: spaCy pipeline (callable returning tokens with ``text``,
            ``pos_``, ``tag_`` and ``is_space``).
        sentence: Raw input text.
        id2label: Mapping from label id to tag string; must contain ``"B-V"``
            or ``"V"``.
        device: Forwarded to ``predict_srl_single``.
        prob_threshold: Keep only frames whose predicate-tag probability at the
            predicate position is at least this value; ``None`` disables it.
        top_k: If not ``None``, only the first ``top_k`` candidates (in
            sentence order) are evaluated.
        pick_best_if_none: When thresholding removes every frame, keep the
            single frame with the highest predicate probability.

    Returns:
        ``(words, frames)``. ``frames`` is a list of dicts with keys
        ``predicate_index``, ``predicate``, ``p_bv``, ``tags`` and ``spans``.
        ``([], [])`` is returned when there are no tokens.

    Raises:
        ValueError: if ``id2label`` has neither a ``"B-V"`` nor a ``"V"`` tag.
    """
    model.eval()

    # -- spaCy-only tokenization, done once for both words and verb detection --
    tokens = [t for t in nlp(sentence) if not t.is_space]
    words = [t.text for t in tokens]
    if not words:
        return [], []

    # verb candidates from spaCy
    verb_idxs = [
        i for i, t in enumerate(tokens)
        if t.pos_ in ("VERB", "AUX") or t.tag_.startswith("VB")
    ]
    if not verb_idxs:
        return words, []   # no predicates found

    # find predicate label id
    pred_ids = [i for i, t in id2label.items() if t in ("B-V", "V")]
    if not pred_ids:
        raise ValueError("Label set has no predicate tag ('B-V' or 'V').")
    b_v_id = pred_ids[0]

    keep = verb_idxs if top_k is None else verb_idxs[:top_k]

    results = []
    for p in keep:
        tags, logits = predict_srl_single(
            model, tokenizer, words, p, id2label, device=device
        )
        # Probability that the candidate token itself is tagged as the predicate.
        p_bv = torch.softmax(logits[p], dim=-1)[b_v_id].item()
        results.append({
            "predicate_index": p,
            "predicate": words[p],
            "p_bv": p_bv,
            "tags": tags,
            "spans": bio_to_spans(tags),
        })

    # optional thresholding
    if prob_threshold is not None:
        passed = [r for r in results if r["p_bv"] >= prob_threshold]
        if not passed and pick_best_if_none and results:
            passed = [max(results, key=lambda r: r["p_bv"])]
        results = passed
    return words, results
def main_predictor(model_path, bert_name, sentence, spacy_model="en_core_web_md"):
    """Load the checkpoint, tokenizer and spaCy pipeline, then run SRL on ``sentence``.

    Args:
        model_path: Path to a checkpoint readable by ``torch.load``. It must
            contain ``label2id`` or ``id2label``; hyper-parameters are read
            from ``hparams`` / ``hyper_parameters`` and weights from
            ``state_dict`` / ``model_state_dict`` (or the checkpoint itself).
        bert_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        sentence: Raw text to analyse.
        spacy_model: spaCy package name; downloaded if it cannot be loaded.

    Returns:
        ``(words, frames)`` as produced by ``predict_srl_allennlp_like_spacy``.

    Raises:
        KeyError: if the checkpoint holds neither ``label2id`` nor ``id2label``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # SECURITY NOTE(review): torch.load unpickles the file, which can execute
    # arbitrary code. Only load checkpoints from a trusted source.
    ckpt = torch.load(model_path, map_location=device)

    hp = ckpt.get("hparams", ckpt.get("hyper_parameters", {}))
    model = PredicateAwareSRL(**hp).to(device)
    state = ckpt.get("state_dict", ckpt.get("model_state_dict", ckpt))
    model.load_state_dict(state)
    model.eval()

    # Ids are coerced to int so lookups by argmax ids work even when the
    # mapping was saved with string keys (e.g. after a JSON round-trip).
    if "label2id" in ckpt:
        id2label = {int(idx): label for label, idx in ckpt["label2id"].items()}
    elif "id2label" in ckpt:
        id2label = {int(idx): label for idx, label in ckpt["id2label"].items()}
    else:
        raise KeyError(
            "Checkpoint contains neither 'label2id' nor 'id2label'; "
            "cannot decode predictions."
        )

    tokenizer = AutoTokenizer.from_pretrained(bert_name, use_fast=True)

    try:
        nlp = spacy.load(spacy_model)
    except OSError:
        # Model package not installed: fetch it once, then retry the load.
        spacy_cli.download(spacy_model)
        nlp = spacy.load(spacy_model)

    words, frames = predict_srl_allennlp_like_spacy(
        model, tokenizer, nlp, sentence, id2label,
        device=device, prob_threshold=0.40, top_k=None, pick_best_if_none=True
    )
    return words, frames

visualizer.py ADDED Viewed

	@@ -0,0 +1,182 @@

+from predictor import main_predictor
+import re
+import itertools
def bio_brackets_to_spans(text: str) -> str:
    """
    Merge consecutive BIO bracket chunks of one role into a single bracket.

    A chunk looks like ``[B-ROLE: token]`` or ``[I-ROLE: token]``. Any chunk
    opens a run; following ``I-`` chunks of the same role join it as long as
    only whitespace separates them. The run is emitted as ``[ROLE: tok tok]``.
    Example:
        [B-ARG2: of] [I-ARG2: the] [I-ARG2: orchards] → [ARG2: of the orchards]
        [B-V: take] → [V: take]
    Everything outside the chunks is copied through unchanged.
    """
    chunk_re = re.compile(r"\[(B|I)-([A-Za-z0-9\-]+):\s*([^\]]+?)\]")

    pieces = []
    run_role = None      # role of the run being collected, None when idle
    run_tokens = []
    consumed = 0         # offset just past the last chunk handled

    for hit in chunk_re.finditer(text):
        kind, role, token = hit.groups()
        gap = text[consumed:hit.start()]
        joins_run = (
            run_role is not None
            and role == run_role
            and kind == "I"
            and gap.strip() == ""
        )
        if joins_run:
            run_tokens.append(token)
        else:
            # close the previous run (if any), then copy the text in between
            if run_role is not None:
                pieces.append(f"[{run_role}: {' '.join(run_tokens)}]")
            pieces.append(gap)
            run_role = role
            run_tokens = [token]
        consumed = hit.end()

    if run_role is not None:
        pieces.append(f"[{run_role}: {' '.join(run_tokens)}]")
    # whatever follows the last chunk
    pieces.append(text[consumed:])
    return "".join(pieces)
def create_description(words, tag_list):
    """Render tokens and BIO tags as a bracketed description string.

    Consecutive tokens of one role are grouped as ``[ROLE: tok tok ...]``,
    tokens tagged ``O`` are emitted verbatim, and all items are joined with
    single spaces. A ``B-``/``I-`` tag opens a group; directly following
    ``I-`` tags of the same role extend it. Any other non-``O`` tag (e.g. a
    bare ``V``) is rendered as ``[tag: tok]``.

    Grouping is done on the (word, tag) pairs themselves rather than by
    re-parsing a formatted string, so tokens that contain ``[`` or ``]`` are
    rendered correctly.

    Args:
        words: Token strings.
        tag_list: One tag per token; extra items on either side are ignored
            (pairs are formed with ``zip``).

    Returns:
        The description string; empty when there are no pairs.
    """
    # Same role alphabet the bracket format in this module accepts.
    chunk_tag = re.compile(r"(B|I)-([A-Za-z0-9\-]+)")

    pieces = []
    open_role = None     # role of the group being collected, None when idle
    open_tokens = []

    for tok, tag in zip(words, tag_list):
        parsed = None if tag == 'O' else chunk_tag.fullmatch(tag)

        if parsed and parsed.group(1) == "I" and parsed.group(2) == open_role:
            open_tokens.append(tok)
            continue

        # Anything else ends the group currently being collected.
        if open_role is not None:
            pieces.append(f"[{open_role}: {' '.join(open_tokens)}]")
            open_role, open_tokens = None, []

        if parsed:
            open_role, open_tokens = parsed.group(2), [tok]
        elif tag == 'O':
            pieces.append(tok)
        else:
            pieces.append(f"[{tag}: {tok}]")

    if open_role is not None:
        pieces.append(f"[{open_role}: {' '.join(open_tokens)}]")
    return ' '.join(pieces)
def create_dict(words, frames):
    """
    Package SRL frames into a plain dictionary.

    The result has two keys: 'verbs', a list with one entry per frame holding
    the frame's predicate ('verb'), its bracketed 'description' and its raw
    'tags'; and 'words', the token list that was passed in.
    """
    verb_entries = [
        {
            'verb': frame['predicate'],
            'description': create_description(words, frame['tags']),
            'tags': frame['tags'],
        }
        for frame in frames
    ]
    return {'verbs': verb_entries, 'words': words}
def print_srl_frames_pretty(words, frames, show_grid=True, color=False):
    """
    Pretty-print SRL frames to stdout.

    Prints the sentence, then for every frame a separator rule, the bracketed
    description from create_description and, optionally, a token/label grid.

    - words: list of token strings
    - frames: sequence of frame dicts; only the "tags" entry is read
    - show_grid: also print a token/label grid aligned by column
    - color: accepted for backward compatibility; it currently has no effect
      because nothing that is printed is colorized

    The grid is built from (word, tag) pairs, so a frame whose tag list is
    shorter or longer than `words` prints the common prefix instead of
    raising IndexError.
    """
    print("Sentence:", " ".join(words))
    if not frames:
        print("  (no predicates detected)")
        return
    for fr in frames:
        tags = fr["tags"]
        print("\n" + "—"*60)
        print(create_description(words, tags))
        if show_grid:
            # token/tag grid aligned by column width
            pairs = list(zip(words, tags))
            widths = [max(len(w), len(t)) for w, t in pairs]
            tok_row = " ".join(
                w.ljust(width) for (w, _), width in zip(pairs, widths)
            )
            # "O" is shown as "." to keep the label row easy to scan
            tag_row = " ".join(
                (t if t != "O" else ".").ljust(width)
                for (_, t), width in zip(pairs, widths)
            )
            print("\nTOKEN:", tok_row)
            print("LABEL:", tag_row)
def prediction(model_path, bert_name, sentence):
    """Run main_predictor on `sentence` and print the frames with the token/label grid, uncolored."""
    tokens, srl_frames = main_predictor(model_path, bert_name, sentence)
    print_srl_frames_pretty(
        tokens,
        srl_frames,
        show_grid=True,
        color=False,
    )
def prediction_formatted(model_path, bert_name, sentence):
    """Run main_predictor on `sentence` and return the frames as the dictionary built by create_dict."""
    tokens, srl_frames = main_predictor(model_path, bert_name, sentence)
    return create_dict(tokens, srl_frames)