Upload 5 files

Browse files

Files changed (5) hide show

data_prep.py +266 -0
model.py +140 -0
predicator.py +141 -0
testing.py +80 -0
training.py +182 -0

data_prep.py ADDED Viewed

	@@ -0,0 +1,266 @@

+from typing import List, Dict, Optional
+from torch.utils.data import Dataset
+import torch
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+from torch.utils.data import Dataset, DataLoader
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+from sklearn.metrics import f1_score
+import json
# One SRL data instance: `words` is the tokenized word list, `predicate_word_idx`
# is the index of the predicate word, `labels` holds one semantic-role tag per word.
# NOTE: a stray `!@dataclass` line used to sit above this class; it is not valid
# Python (and `dataclass` is never imported), so the class keeps its explicit
# __init__ instead.
class SRLSample:
    """Word-level semantic role labeling example for a single predicate.

    Args:
        words: The sentence as a list of words.
        predicate_word_idx: Index into `words` of the predicate.
        labels: One role tag per word (same length as `words`).
        predicate_form: Optional surface form to use for the predicate instead
            of `words[predicate_word_idx]`.
    """

    def __init__(self, words: List[str], predicate_word_idx: int, labels: List[str], predicate_form: Optional[str] = None):
        self.words = words
        self.predicate_word_idx = predicate_word_idx
        self.labels = labels
        self.predicate_form = predicate_form

    def __repr__(self) -> str:
        # Readable representation for debugging / logging of individual samples.
        return (
            f"{type(self).__name__}(words={self.words!r}, "
            f"predicate_word_idx={self.predicate_word_idx!r}, "
            f"labels={self.labels!r}, predicate_form={self.predicate_form!r})"
        )
# To Leah: an SRLSample is ONE instance, so this helper converts every record of
# a dataset file (words, predicate_word_idx, labels) into a list of SRLSample objects.
def create_srl_samples(data_path):
    """Load a JSON Lines file into a list of SRLSample objects.

    Every non-blank line must be a JSON object whose keys match the SRLSample
    constructor arguments (`words`, `predicate_word_idx`, `labels` and,
    optionally, `predicate_form`).

    Args:
        data_path: Path of the UTF-8 encoded .jsonl file.

    Returns:
        The samples in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        TypeError: If a record has keys the SRLSample constructor does not accept.
    """
    samples = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at the end of the
                # file) carry no record; json.loads would raise on them.
                continue
            samples.append(SRLSample(**json.loads(line)))
    return samples
+#Example
+#if __name__ == '__main__'
+# data_class_train = create_srl_samples('/content/drive/MyDrive/Dissertation/srl_synthetic_100.jsonl')
+# data_class_dev = create_srl_samples('/content/drive/MyDrive/Dissertation/srl_synthetic_dev_10.jsonl')
+# data_class_test = create_srl_samples('/content/drive/MyDrive/Dissertation/srl_synthetic_test_10.jsonl')
class SRLDataset(Dataset):
    """
    Torch dataset that turns word-level SRL samples into BERT-ready tensors.

    Expects samples at WORD-level. We build BERT inputs as:
      [CLS] <sentence (wordpiece)> [SEP] <predicate (wordpiece)> [SEP]
    We keep:
      - wordpiece indices for each word's FIRST subtoken (to pool BERT to word level)
      - sentence lengths
      - predicate's WORD index within the sentence (for gp from BiLSTM outputs)

    Args:
        samples: Objects exposing `words`, `predicate_word_idx`, `labels` and
            `predicate_form`.
        tokenizer: Tokenizer whose encodings provide `word_ids()` and which has
            `cls_token_id` / `sep_token_id`.
        label2id: Mapping from role tag to integer id.
        max_length: Maximum length of the full wordpiece sequence.
        debug_print: When true, print the wordpiece tokens of every item.
    """
    def __init__(self, samples: "List[SRLSample]", tokenizer: "AutoTokenizer", label2id: Dict[str, int], max_length: int = 256, debug_print= False):
        self.samples = samples
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.id2label = {v: k for k, v in label2id.items()}
        self.max_length = max_length
        self.debug_print = debug_print

    def __len__(self):
        return len(self.samples)

    def _tokenize_sentence(self, words: List[str]):
        """Tokenize pre-split words (no special tokens) so word boundaries are preserved."""
        enc_sent = self.tokenizer(
            words,
            is_split_into_words=True,
            add_special_tokens=False,
            return_attention_mask=False,
            return_token_type_ids=False
        )
        return enc_sent  # encoding with 'input_ids' (and word_ids())

    def _tokenize_predicate(self, form: str):
        """Tokenize the predicate surface form (no special tokens)."""
        enc_pred = self.tokenizer(
            form,
            add_special_tokens=False,
            return_attention_mask=False,
            return_token_type_ids=False
        )
        return enc_pred

    def __getitem__(self, idx):
        """Build the tensors for sample `idx`.

        Returns:
            Dict with full-sequence tensors (`input_ids`, `token_type_ids`,
            `attention_mask`) and word-level tensors (`word_first_wp_fullidx`,
            `labels`, `indicator`) plus the scalars `sent_len` and
            `pred_word_idx`.

        Raises:
            AssertionError: If the predicate index is out of range or the
                number of labels differs from the number of words.
            KeyError: If a label is missing from `label2id`, or a word produced
                no wordpiece at all.
        """
        instance = self.samples[idx]
        words = instance.words
        n_words = len(words)
        assert 0 <= instance.predicate_word_idx < n_words, "Bad predicate index."
        pred_form = instance.predicate_form if instance.predicate_form is not None else words[instance.predicate_word_idx]

        # Tokenize sentence and predicate separately (text -> wordpiece ids)
        enc_sent = self._tokenize_sentence(words)
        enc_pred = self._tokenize_predicate(pred_form)
        sent_wp_ids = enc_sent["input_ids"]                       # list[int]
        pred_wp_ids = enc_pred["input_ids"]                       # list[int]

        # Final layout: [CLS] sentence [SEP] predicate [SEP]
        #   segment A (0): [CLS] sentence [SEP]
        #   segment B (1): predicate [SEP]
        # (A later variant may use
        #  [CLS] sentence [SEP] ARG0_token [SEP] ARG1_token [SEP] ARG2_token [SEP]
        #  -> model for emotion, formality and politeness.)
        input_ids = [self.tokenizer.cls_token_id] + sent_wp_ids + [self.tokenizer.sep_token_id] \
                    + pred_wp_ids + [self.tokenizer.sep_token_id]
        ttids = [0] * (1 + len(sent_wp_ids) + 1) + [1] * (len(pred_wp_ids) + 1)

        # Map each WORD to the position of its FIRST wordpiece in the full
        # sequence. The sentence encoding has no special tokens, so a position
        # inside it is shifted by exactly one ([CLS]) in the full sequence; this
        # avoids tokenizing the sentence a second time just to read word_ids().
        first_pos_by_wid = {}
        for pos, wid in enumerate(enc_sent.word_ids()):
            if wid is not None and wid not in first_pos_by_wid:
                first_pos_by_wid[wid] = pos + 1  # +1 for [CLS]
        # Indexing by word id keeps the list aligned with [0 .. n_words-1];
        # a word that produced no wordpiece surfaces here as a KeyError.
        word_first_wp_fullidx = [first_pos_by_wid[wid] for wid in range(n_words)]

        # Labels to IDs
        label_ids = [self.label2id[lbl] for lbl in instance.labels]
        assert len(label_ids) == n_words

        # Predicate indicator at word level, e.g. [0,0,0,0,0] -> [0,0,1,0,0]
        indicator = [0] * n_words
        indicator[instance.predicate_word_idx] = 1

        # Attention mask for the full input
        attention_mask = [1] * len(input_ids)

        # Truncate if needed (rare for normal SRL sentences but keep safe)
        if len(input_ids) > self.max_length:
            # For simplicity the tail is clipped, which can cut into the
            # predicate segment and drop the final [SEP].
            input_ids = input_ids[:self.max_length]
            ttids = ttids[:self.max_length]
            attention_mask = attention_mask[:self.max_length]
            # NOTE: word positions beyond the cut are clamped to the last kept
            # position so that gathering stays in range.
            max_pos = self.max_length - 1
            word_first_wp_fullidx = [min(p, max_pos) for p in word_first_wp_fullidx]

        if self.debug_print:
            toks_debug = self.tokenizer.convert_ids_to_tokens(input_ids, skip_special_tokens=False)
            print("[DeBug idx = {}]".format(idx)+" ".join(toks_debug))

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "token_type_ids": torch.tensor(ttids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "word_first_wp_fullidx": torch.tensor(word_first_wp_fullidx, dtype=torch.long),  # [n_words]
            "labels": torch.tensor(label_ids, dtype=torch.long),                               # [n_words]
            "indicator": torch.tensor(indicator, dtype=torch.long),                            # [n_words]
            "sent_len": torch.tensor(len(words), dtype=torch.long),
            "pred_word_idx": torch.tensor(instance.predicate_word_idx, dtype=torch.long)
        }
def srl_collate(batch: List[Dict], pad_token_id: int, pad_label_id: int = -100):
    """
    Collate per-sample dicts into one padded batch.

    Full wordpiece sequences are right-padded to the longest sequence in the
    batch; word-level tensors are right-padded to the longest sentence.

    Args:
        batch: Items carrying `input_ids`, `token_type_ids`, `attention_mask`,
            `word_first_wp_fullidx`, `labels`, `indicator`, `sent_len` and
            `pred_word_idx`.
        pad_token_id: Fill value for padded `input_ids` positions.
        pad_label_id: Fill value for padded `labels` positions.

    Returns:
        Dict of batched tensors ready for the model.
    """
    batch_size = len(batch)
    seq_lengths = [item["input_ids"].size(0) for item in batch]
    word_counts = [int(item["sent_len"]) for item in batch]
    widest_seq = max(seq_lengths)
    widest_sent = max(word_counts)

    # Pre-filled canvases; real values are copied over the left part of each row.
    padded_ids = torch.full((batch_size, widest_seq), pad_token_id, dtype=torch.long)
    padded_types = torch.zeros((batch_size, widest_seq), dtype=torch.long)
    padded_attn = torch.zeros((batch_size, widest_seq), dtype=torch.long)
    padded_first = torch.full((batch_size, widest_sent), -1, dtype=torch.long)
    padded_labels = torch.full((batch_size, widest_sent), pad_label_id, dtype=torch.long)
    padded_indicator = torch.zeros((batch_size, widest_sent), dtype=torch.long)
    valid_words = torch.zeros((batch_size, widest_sent), dtype=torch.bool)
    lengths = torch.zeros((batch_size,), dtype=torch.long)
    predicate_pos = torch.zeros((batch_size,), dtype=torch.long)

    for row, (item, seq_len, n_words) in enumerate(zip(batch, seq_lengths, word_counts)):
        # Full-sequence tensors
        padded_ids[row, :seq_len] = item["input_ids"]
        padded_types[row, :seq_len] = item["token_type_ids"]
        padded_attn[row, :seq_len] = item["attention_mask"]
        # Word-level tensors
        padded_first[row, :n_words] = item["word_first_wp_fullidx"]
        padded_labels[row, :n_words] = item["labels"]
        padded_indicator[row, :n_words] = item["indicator"]
        valid_words[row, :n_words] = True
        lengths[row] = n_words
        predicate_pos[row] = item["pred_word_idx"]

    return {
        "input_ids": padded_ids,
        "token_type_ids": padded_types,
        "attention_mask": padded_attn,
        "word_first_wp_fullidx": padded_first,   # [B, max_n] (full-seq positions; -1 for pad)
        "sentence_mask": valid_words,            # [B, max_n] (bool mask for valid words)
        "labels": padded_labels,                 # [B, max_n] (pad_label_id for pad)
        "indicator": padded_indicator,           # [B, max_n] 0/1
        "sent_lens": lengths,                    # [B]
        "pred_word_idx": predicate_pos           # [B]
    }
def data_processing_for_loader(train_dev_test: List[SRLSample], train_sample: List[SRLSample], dev_sample: List[SRLSample], test_sample: List[SRLSample], tokenizer, max_length: int = 128, debug_print: bool = False):
    '''
    Build the label vocabulary and wrap each split in an SRLDataset.

    Args:
        train_dev_test: Appended list of Train/Dev/Test SRLSamples; every label
            seen here receives an id, in order of first appearance.
        train_sample: List of SRLSample for training.
        dev_sample: List of SRLSample for validation.
        test_sample: List of SRLSample for testing.
        tokenizer: Tokenizer handed to every SRLDataset.
        max_length: Maximum wordpiece sequence length for every split
            (previously hard-coded to 128).
        debug_print: Forwarded to SRLDataset to print each item's wordpieces.

    Returns:
        (train_dataset, dev_dataset, test_dataset, label2id, id2label)
    '''
    label2id = {}
    for sample in train_dev_test:
        for label in sample.labels:
            # setdefault keeps the id of a label that was already registered.
            label2id.setdefault(label, len(label2id))
    id2label = {v: k for k, v in label2id.items()}

    # Datasets "before loader": the caller wraps them in DataLoaders.
    train_bf_loader, dev_bf_loader, test_bf_loader = (
        SRLDataset(split, tokenizer, label2id, max_length=max_length, debug_print=debug_print)
        for split in (train_sample, dev_sample, test_sample)
    )
    return train_bf_loader, dev_bf_loader, test_bf_loader, label2id, id2label

model.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+from transformers import AutoModel, AutoConfig
+class PredicateAwareSRL(nn.Module):
+    def __init__(self,
+                 bert_name: str,
+                 num_labels: int,
+                 use_indicator: bool = True,
+                 indicator_dim: int = 10,          # CHANGED: 10-dim predicate indicator
+                 lstm_hidden: int = 768,           # CHANGED: BiLSTM hidden size = 768 (bidirectional)
+                 mlp_hidden: int = 300,            # CHANGED: MLP hidden size = 300
+                 dropout: float = 0.1,
+                 use_distance: bool = True,        # NEW: enable relative position (distance) embeddings
+                 pos_dim: int = 50,                # NEW: size of position embedding (random init, trainable)
+                 max_distance: int = 128):         # NEW: clamp |i - p| to this range for bucketing
+        super().__init__()
+        self.config = AutoConfig.from_pretrained(bert_name)
+        self.bert = AutoModel.from_pretrained(bert_name)
+        self.use_indicator = use_indicator
+        # --- Input dim to BiLSTM = BERT_dim + (indicator_dim) + (pos_dim)
+        bert_dim = self.config.hidden_size
+        in_dim = bert_dim + (indicator_dim if use_indicator else 0)
+        # Two rows which indicate 0 not predicate 1 is predicate, so need to 2 embedding (rows)
+        # num_embeddings (int) – size of the dictionary of embeddings
+        # embedding_dim (int) – the size of each embedding vector
+        if use_indicator:
+            self.indicator_emb = nn.Embedding(2, indicator_dim)  # 0/1 → 10-dim (random init, trainable)  # CHANGED
+        self.use_distance = use_distance                         # NEW
+        self.max_distance = max_distance                          # NEW
+        if use_distance:
+            # Distance buckets: [-max_distance .. +max_distance] → indices [0 .. 2*max_distance]
+            self.pos_emb = nn.Embedding(2 * max_distance + 1, pos_dim)  # NEW (random init, trainable)
+            in_dim += pos_dim                                         # NEW
+        # BiLSTM (bidirectional): total output dim = lstm_hidden
+        self.bilstm = nn.LSTM(
+            input_size=in_dim,
+            hidden_size=lstm_hidden // 2,  # bi → half per direction
+            num_layers=1,
+            dropout=0.0,
+            bidirectional=True,
+            batch_first=True
+        )
+        self.dropout = nn.Dropout(dropout)
+        # Classifier: concat(g_i, gp) so input dim = 2 * lstm_hidden
+        self.classifier = nn.Sequential(
+            nn.Linear(lstm_hidden * 2, mlp_hidden),   # CHANGED (mlp_hidden=300)
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(mlp_hidden, num_labels)
+        )
+        self.pad_label_id = -100
+    def forward(self,
+                input_ids: torch.Tensor,           # [B, L]
+                token_type_ids: torch.Tensor,      # [B, L]
+                attention_mask: torch.Tensor,      # [B, L]
+                word_first_wp_fullidx: torch.Tensor,  # [B, max_n] (positions in full seq; -1 for pad)
+                sentence_mask: torch.Tensor,       # [B, max_n] (bool)
+                sent_lens: torch.Tensor,           # [B]
+                pred_word_idx: torch.Tensor,       # [B]
+                indicator: torch.Tensor | None = None,  # [B, max_n] 0/1
+                labels: torch.Tensor | None = None):    # [B, max_n]
+        B, L = input_ids.size()
+        device = input_ids.device
+        # ---- BERT encoder
+        bert_out = self.bert(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask
+        )
+        H = bert_out.last_hidden_state  # [B, L, D]
+        # ---- Subword → word pooling (first subword)
+        # Gather sentence word-level representations by taking FIRST subtoken hidden per word
+        # Prepare indices (replace -1 with 0 to avoid gather OOB; we'll mask later)
+        # This process is required to feed word level to predict BIO and role per word
+        #.clone is for deep copy won't change original data
+        gather_idx = word_first_wp_fullidx.clone()
+        gather_idx[gather_idx < 0] = 0
+        gather_idx = gather_idx.unsqueeze(-1).expand(-1, -1, H.size(-1))  # [B, max_n, D]
+        H_words = torch.gather(H, dim=1, index=gather_idx)                 # [B, max_n, D]
+        H_words = H_words * sentence_mask.unsqueeze(-1)                    # zero out pads
+        # ---- Concatenate predicate indicator (0/1 → emb)
+        # word_first_wp_fullidx: [1, 2, 3, -1, -1]
+        # gather_idx after clamp: [1, 2, 3, 0, 0]   # 0 points to [CLS], just a placeholder
+        # H_words = gather(H, ...)                  # grabs vectors at positions 1,2,3,0,0
+        # sentence_mask:        [1, 1, 1, 0, 0]
+        # H_words *= mask →     [vec1, vec2, vec3, 0, 0]   # padded slots zeroed out
+        X = H_words
+        if self.use_indicator and indicator is not None:
+            ind_emb = self.indicator_emb(indicator.clamp(0, 1))            # [B, max_n, 10]  # CHANGED
+            X = torch.cat([X, ind_emb], dim=-1)
+        # ---- NEW: Relative position (distance-to-predicate) embeddings
+        if self.use_distance:
+            # positions: 0..max_n-1 per sentence
+            max_n = X.size(1)
+            positions = torch.arange(max_n, device=device).unsqueeze(0).expand(B, -1)  # [B, max_n]
+            rel = positions - pred_word_idx.unsqueeze(1)                               # [B, max_n], can be <0
+            rel = rel.clamp(-self.max_distance, self.max_distance) + self.max_distance # shift to [0 .. 2*max_distance]
+            pos_feats = self.pos_emb(rel)                                              # [B, max_n, pos_dim]  # NEW
+            X = torch.cat([X, pos_feats], dim=-1)                                      # [B, max_n, in_dim]  # NEW
+        # ---- BiLSTM (packed)
+        lengths = sent_lens.detach().cpu()
+        packed = pack_padded_sequence(X, lengths=lengths, batch_first=True, enforce_sorted=False)
+        G_packed, _ = self.bilstm(packed)
+        G, _ = pad_packed_sequence(G_packed, batch_first=True)      # [B, max_n, lstm_hidden]
+        G = self.dropout(G)
+        # ---- Predicate hidden (word-level) and concat to every position
+        batch_idx = torch.arange(B, device=device)
+        gp = G[batch_idx, pred_word_idx.clamp(min=0), :]            # [B, lstm_hidden]
+        gp_expanded = gp.unsqueeze(1).expand(-1, G.size(1), -1)     # [B, max_n, lstm_hidden]
+        logits = self.classifier(torch.cat([G, gp_expanded], dim=-1))  # [B, max_n, num_labels]
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss(ignore_index=self.pad_label_id)
+            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
+        return logits, loss

predicator.py ADDED Viewed

	@@ -0,0 +1,141 @@

+## This is testing
+import torch
@torch.no_grad()
def predict_srl_single(model, tokenizer, words, predicate_word_idx, id2label, device="cuda"):
    """Tag one sentence with respect to one candidate predicate.

    The input is laid out as `[CLS] sentence [SEP] predicate [SEP]`, where the
    predicate segment is the surface form of `words[predicate_word_idx]`.

    Args:
        model: Callable accepting the keyword tensors built below and returning
            `(logits, loss)`.
        tokenizer: Tokenizer whose encodings provide `word_ids()`.
        words: The sentence as a list of words.
        predicate_word_idx: Index into `words` of the candidate predicate.
        id2label: Mapping from label id to tag string.
        device: Device the input tensors are moved to.

    Returns:
        (tags, logits): one tag per word, and the word-level logits on CPU
        with shape [n_words, num_labels].

    Raises:
        IndexError: If `predicate_word_idx` is not in `[0, len(words))`.
    """
    n_words = len(words)
    # A negative index would silently pick a word from the end for the
    # indicator/predicate segment while the index tensor sent to the model
    # stays negative, so the features would disagree. Reject it up front.
    if not 0 <= predicate_word_idx < n_words:
        raise IndexError(
            f"predicate_word_idx {predicate_word_idx} out of range for {n_words} words"
        )

    # tokenize sentence (no specials)
    sent_enc = tokenizer(
        words, is_split_into_words=True, add_special_tokens=False,
        return_attention_mask=False, return_token_type_ids=False
    )
    sent_wp_ids = sent_enc["input_ids"]
    sent_word_ids = sent_enc.word_ids()

    # first-subword position per word in the FULL sequence: [CLS] sent [SEP] pred [SEP]
    first_pos_by_wid = {}
    for pos, wid in enumerate(sent_word_ids):
        if wid is not None and wid not in first_pos_by_wid:
            first_pos_by_wid[wid] = pos + 1  # +1 for [CLS]
    word_first_wp_fullidx = torch.tensor(
        [first_pos_by_wid[i] for i in range(n_words)], dtype=torch.long
    ).unsqueeze(0)

    # predicate segment = surface form of the predicate word
    pred_enc = tokenizer(
        [words[predicate_word_idx]], is_split_into_words=True, add_special_tokens=False,
        return_attention_mask=False, return_token_type_ids=False
    )
    pred_wp_ids = pred_enc["input_ids"]

    # assemble full input
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    input_ids = [cls_id] + sent_wp_ids + [sep_id] + pred_wp_ids + [sep_id]
    token_type_ids = [0] * (1 + len(sent_wp_ids) + 1) + [1] * (len(pred_wp_ids) + 1)
    attention_mask = [1] * len(input_ids)

    # tensors (batch of one)
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(token_type_ids).unsqueeze(0).to(device)
    attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)
    sent_len = torch.tensor([n_words], dtype=torch.long).to(device)
    sentence_mask = torch.ones(1, n_words, dtype=torch.bool).to(device)
    pred_word_idx = torch.tensor([predicate_word_idx], dtype=torch.long).to(device)
    indicator = torch.zeros(1, n_words, dtype=torch.long).to(device)
    indicator[0, predicate_word_idx] = 1
    word_first_wp_fullidx = word_first_wp_fullidx.to(device)

    # forward
    logits, _ = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        word_first_wp_fullidx=word_first_wp_fullidx,
        sentence_mask=sentence_mask,
        sent_lens=sent_len,
        pred_word_idx=pred_word_idx,
        indicator=indicator,
        labels=None
    )
    pred_ids = logits.argmax(-1).squeeze(0).tolist()
    tags = [id2label[i] for i in pred_ids]
    return tags, logits.squeeze(0).cpu()  # [n_words, num_labels]
def bio_to_spans(tags):
    """Collapse a BIO tag sequence into `(role, start, end)` argument spans.

    A span opens at a `B-<role>` tag and extends over the directly following
    `I-<role>` tags; `end` is inclusive. `O`, tags ending in `-V` (the
    predicate itself) and `I-` tags without a matching opener are skipped.
    """
    spans = []
    total = len(tags)
    pos = 0
    while pos < total:
        tag = tags[pos]
        opens_argument = tag != "O" and not tag.endswith("-V") and tag.startswith("B-")
        if not opens_argument:
            pos += 1
            continue
        role = tag[2:]
        continuation = f"I-{role}"
        end = pos
        # Grow the span while the next tag continues the same role.
        while end + 1 < total and tags[end + 1] == continuation:
            end += 1
        spans.append((role, pos, end))
        pos = end + 1
    return spans
@torch.no_grad()
def predict_srl_all_predicates(model, tokenizer, sentence, id2label, device="cuda", prob_threshold=0.50):
    """Try every word of `sentence` as the predicate and keep the accepted ones.

    A word is kept when the model tags that word itself as `B-V` and, if
    `prob_threshold` is not None, the softmax probability of `B-V` at that
    position is at least `prob_threshold`.

    Returns:
        (words, results): the whitespace-split words and one dict per accepted
        predicate with keys `predicate_index`, `predicate`, `tags`, `spans`.

    Raises:
        ValueError: If `id2label` contains no `B-V` tag.
    """
    words = sentence.split()

    # Numeric id of the "B-V" tag (first match in mapping order).
    b_v_id = next((label_id for label_id, name in id2label.items() if name == "B-V"), None)
    if b_v_id is None:
        raise ValueError("Label set has no 'B-V' tag.")

    results = []
    for p, word in enumerate(words):
        tags, logits = predict_srl_single(model, tokenizer, words, p, id2label, device=device)
        scores_at_p = logits[p]
        # Predicate decision at the candidate's own position.
        keep = scores_at_p.argmax(-1).item() == b_v_id
        # Optional confidence gate.
        if prob_threshold is not None:
            b_v_prob = torch.softmax(scores_at_p, dim=-1)[b_v_id].item()
            keep = keep and (b_v_prob >= prob_threshold)
        if not keep:
            continue
        results.append({
            "predicate_index": p,
            "predicate": word,
            "tags": tags,
            "spans": bio_to_spans(tags)
        })
    return words, results
+# words, preds = predict_srl_all_predicates(model, tokenizer, sentence, id2label, device=device)
def predicator_srl(sentence, model=None, tokenizer=None, id2label=None, device=None):
    """Run SRL for every candidate predicate of `sentence`.

    The original one-argument call form still works: any argument left as None
    is looked up among this module's globals (`model`, `tokenizer`,
    `id2label`, `device`). Nothing in this file defines those globals, so a
    missing one is reported explicitly instead of surfacing as a NameError.

    Args:
        sentence: Whitespace-separated input sentence.
        model: Trained SRL model; falls back to the module-level `model`.
        tokenizer: Matching tokenizer; falls back to the module-level `tokenizer`.
        id2label: Label id -> tag mapping; falls back to the module-level `id2label`.
        device: Target device; falls back to the module-level `device`, then to
            "cuda" when available, else "cpu".

    Returns:
        (words, preds) as produced by `predict_srl_all_predicates`.

    Raises:
        RuntimeError: If model, tokenizer or id2label is neither passed nor
            defined at module level.
    """
    scope = globals()
    model = scope.get("model") if model is None else model
    tokenizer = scope.get("tokenizer") if tokenizer is None else tokenizer
    id2label = scope.get("id2label") if id2label is None else id2label

    missing = [
        name
        for name, value in (("model", model), ("tokenizer", tokenizer), ("id2label", id2label))
        if value is None
    ]
    if missing:
        raise RuntimeError(
            "predicator_srl needs " + ", ".join(missing)
            + "; pass them as arguments or define them at module level."
        )

    if device is None:
        device = scope.get("device")
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    words, preds = predict_srl_all_predicates(model, tokenizer, sentence, id2label, device=device)
    return words, preds
if __name__ == "__main__":
    # Demo: tag one sentence and print every detected predicate together
    # with its word-level tags and argument spans.
    sentence = "Hojeong decide to go to the school"
    words, preds = predicator_srl(sentence)
    print(words)
    divider = "-" * 60
    for r in preds:
        word_tag_pairs = list(zip(words, r["tags"]))
        print(f"Predicate: {r['predicate']} (idx {r['predicate_index']})")
        print("Tags:", word_tag_pairs)
        print("Spans:", r["spans"])  # (ROLE, start, end) indices over words
        print(divider)

testing.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from SRL_model import SRL_BERT_model
+from collections import Counter
+import torch
def bio_to_spans(tags):
    """Turn a BIO tag sequence into a list of `(role, start, end)` spans.

    `B-<role>` opens a span that continues over the immediately following
    `I-<role>` tags (`end` inclusive). `O`, predicate tags (ending in `-V`)
    and orphan `I-` tags produce no span.
    """
    spans = []
    open_role = None   # role of the span currently being extended, if any
    open_start = None  # index where that span began
    for idx, tag in enumerate(tags):
        if open_role is not None:
            if tag == f"I-{open_role}":
                continue  # still inside the open span
            # Any other tag closes the open span at the previous index.
            spans.append((open_role, open_start, idx - 1))
            open_role = None
        if tag.startswith("B-") and not tag.endswith("-V"):
            open_role, open_start = tag[2:], idx
    if open_role is not None:
        # Span running up to the end of the sequence.
        spans.append((open_role, open_start, len(tags) - 1))
    return spans
@torch.no_grad()
def eval_span_f1(model, dataloader, id2label, device="cuda"):
    """Compute micro-averaged span precision, recall and F1 over a dataloader.

    A predicted span counts as correct only when its role, start and end all
    match a gold span of the same sentence.

    Args:
        model: Model returning `(logits, loss)` when called with a batch dict.
        dataloader: Iterable of batch dicts containing a `labels` tensor in
            which padded positions are marked with -100.
        id2label: Mapping from label id to tag string.
        device: Device the batch tensors are moved to.

    Returns:
        (precision, recall, f1) as floats.
    """
    model.eval()
    tp = fp = fn = 0
    for batch in dataloader:
        # Keep gold on CPU so it can be masked together with the CPU predictions.
        gold = batch["labels"].cpu()         # [B, Lw]
        mask = (gold != -100)
        batch = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in batch.items()}
        logits, _ = model(**batch)
        pred = logits.argmax(-1).cpu()       # [B, Lw]
        for g_seq, p_seq, m in zip(gold, pred, mask):
            gl = [id2label[int(i)] for i in g_seq[m].tolist()]
            pl = [id2label[int(i)] for i in p_seq[m].tolist()]
            G = Counter(bio_to_spans(gl))
            P = Counter(bio_to_spans(pl))
            # micro counts
            matched = sum((G & P).values())
            tp += matched
            fp += sum(P.values()) - matched
            fn += sum(G.values()) - matched
    # The small epsilon avoids division by zero when there are no spans.
    prec = tp / (tp + fp + 1e-12)
    rec = tp / (tp + fn + 1e-12)
    f1 = 2 * prec * rec / (prec + rec + 1e-12)
    return prec, rec, f1
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ckpt_path = "/blue/bonniejdorr/youms/SRL-Aware_Model/model/best_srl_Sep_29.ckpt"   # <-- change if needed
    ckpt = torch.load(ckpt_path, map_location=device)

    label2id = ckpt["label2id"]
    id2label = {v: k for k, v in label2id.items()}

    # Older checkpoints store no hyper-parameters; fall back to the defaults
    # below. This must happen BEFORE the model is built.
    hp = ckpt.get("hparams", {
        "bert_name": "bert-base-cased",
        "num_labels": len(label2id),
        "use_indicator": True,
        "use_distance": True,
        "indicator_dim": 10,
        "lstm_hidden": 768,
        "mlp_hidden": 300,
        "pos_dim": 50,
        "max_distance": 128,
        "dropout": 0.1,
    })
    model = SRL_BERT_model.PredicateAwareSRL(**hp).to(device)

    # Accept both checkpoint layouts: "state_dict" and "model_state".
    state = ckpt["state_dict"] if "state_dict" in ckpt else ckpt["model_state"]
    model.load_state_dict(state)
    model.eval()

    # NOTE(review): `test_loader` is not defined anywhere in this file; it has
    # to be built (dataset + DataLoader with the SRL collate function) before
    # this call — confirm where it is meant to come from.
    prec, rec, span_f1 = eval_span_f1(model, test_loader, id2label, device=device)
    print(f"[TEST-SPAN] P={prec:.3f} R={rec:.3f} F1={span_f1:.3f}")

training.py ADDED Viewed

	@@ -0,0 +1,182 @@

+from SRL_MODEL import data_prep, SRL_BERT_model
+import torch
+from transformers import AutoTokenizer, get_linear_schedule_with_warmup
+from sklearn.metrics import f1_score
+import pickle
def save_pkl(tgt_list, svg_path):
    """Pickle `tgt_list` into the file at `svg_path`, replacing any existing file."""
    with open(svg_path, mode="wb") as sink:
        pickle.dump(tgt_list, sink)
def load_pkl(path):
    """Return the object unpickled from the file at `path`.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    with open(path, mode="rb") as source:
        return pickle.load(source)
def train_one_epoch(
    model,
    dataloader,
    optimizer,
    device="cuda",
    scheduler=None,
    grad_accum_steps=1,
    amp=True,
    max_grad_norm=1.0,
):
    """Train `model` for one pass over `dataloader`.

    Args:
        model: Module returning `(logits, loss)` when called with a batch dict.
        dataloader: Iterable of batch dicts; tensor values are moved to `device`.
        optimizer: Optimizer over the model parameters.
        device: Target device for the batches.
        scheduler: Optional LR scheduler, stepped once per optimizer step.
        grad_accum_steps: Number of batches accumulated per optimizer step
            (values below 1 are treated as 1).
        amp: Use float16 autocast + gradient scaling; only takes effect when
            CUDA is available and `device` is a CUDA device.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        Mean (un-scaled) loss over the batches of this epoch.
    """
    from contextlib import nullcontext  # local import: stdlib only, keeps the block self-contained

    model.train()
    total_loss, n_steps = 0.0, 0
    grad_accum_steps = max(1, int(grad_accum_steps))  # guards the division below

    # Mixed precision only makes sense when the batches actually live on CUDA.
    use_amp = bool(amp) and torch.cuda.is_available() and str(device).startswith("cuda")
    scaler = torch.cuda.amp.GradScaler() if use_amp else None

    def _autocast():
        if use_amp:
            return torch.autocast(device_type="cuda", dtype=torch.float16)
        return nullcontext()

    def _apply_update():
        # One optimizer step: unscale -> clip -> step -> reset grads -> schedule.
        if use_amp:
            scaler.unscale_(optimizer)  # clip on the true (unscaled) gradients
        # `nn` is not imported in this file; go through the torch namespace.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        if use_amp:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        if scheduler is not None:
            scheduler.step()

    optimizer.zero_grad(set_to_none=True)
    pending = 0  # batches accumulated since the last optimizer step
    for batch in dataloader:
        batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
        with _autocast():
            _, loss = model(**batch)  # model must return (logits, loss)
        total_loss += float(loss.detach().item())
        n_steps += 1
        loss = loss / grad_accum_steps  # for accumulation
        if use_amp:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        pending += 1
        if pending == grad_accum_steps:
            _apply_update()
            pending = 0

    # Flush a trailing partial accumulation group; otherwise its gradients
    # would be discarded when the next epoch zeroes them.
    if pending:
        _apply_update()

    return total_loss / max(1, n_steps)
+#This is Validation
@torch.no_grad()
def eval_loss_and_token_f1(model, dataloader, id2label=None, device="cuda", average="micro"):
    """Evaluate mean batch loss and token-level F1 on `dataloader`.

    Positions whose gold label is -100 (padding) are excluded from the F1.
    `id2label` is accepted but not used by the computation.

    Returns:
        (mean_loss, f1): loss averaged over batches, and `f1_score` over all
        non-padded tokens using the given `average` mode.
    """
    model.eval()
    loss_sum = 0.0
    batch_count = 0
    predicted, reference = [], []
    for batch in dataloader:
        gold = batch["labels"]                  # stays where it is for masking
        valid = gold != -100
        on_device = {
            key: (value.to(device) if torch.is_tensor(value) else value)
            for key, value in batch.items()
        }
        logits, loss = model(**on_device)       # loss computed by the model itself
        loss_sum += float(loss.item())
        batch_count += 1
        best_ids = logits.argmax(-1).cpu()
        predicted.extend(best_ids[valid].tolist())
        reference.extend(gold[valid].tolist())
    f1 = f1_score(reference, predicted, average=average)
    mean_loss = loss_sum / max(1, batch_count)
    return mean_loss, f1
if __name__ == '__main__':
    bert_name = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(bert_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the three splits. The paths are the ones from the example in
    # data_prep.py — adjust them to where the .jsonl files actually live.
    train_path = '/content/drive/MyDrive/Dissertation/srl_synthetic_100.jsonl'
    dev_path = '/content/drive/MyDrive/Dissertation/srl_synthetic_dev_10.jsonl'
    test_path = '/content/drive/MyDrive/Dissertation/srl_synthetic_test_10.jsonl'
    data_class_train = data_prep.create_srl_samples(train_path)
    data_class_dev = data_prep.create_srl_samples(dev_path)
    data_class_test = data_prep.create_srl_samples(test_path)

    # The label vocabulary is built over all three splits.
    train_dev_test_data = data_class_train + data_class_dev + data_class_test
    train_bf_loader, dev_bf_loader, test_bf_loader, label2id, id2label = data_prep.data_processing_for_loader(
        train_dev_test_data, data_class_train, data_class_dev, data_class_test, tokenizer
    )

    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    collate = lambda b: data_prep.srl_collate(b, pad_token_id=pad_token_id, pad_label_id=-100)
    train_loader = torch.utils.data.DataLoader(train_bf_loader, batch_size=16, shuffle=True, collate_fn=collate)
    dev_loader = torch.utils.data.DataLoader(dev_bf_loader, batch_size=16, shuffle=False, collate_fn=collate)
    test_loader = torch.utils.data.DataLoader(test_bf_loader, batch_size=16, shuffle=False, collate_fn=collate)

    # Kept in one dict so the exact configuration is stored in the checkpoint.
    hparams = {
        "bert_name": bert_name,
        "num_labels": len(label2id),
        "use_indicator": True,
        "use_distance": True,
        "indicator_dim": 10,
        "lstm_hidden": 768,
        "mlp_hidden": 300,
        "pos_dim": 50,
        "max_distance": 128,
        "dropout": 0.1,
    }
    model = SRL_BERT_model.PredicateAwareSRL(**hparams).to(device)

    # Optimizer and linear warmup/decay schedule
    num_epochs = 12
    grad_accum_steps = 1
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    total_steps = len(train_loader) * num_epochs // max(1, grad_accum_steps)
    warmup_steps = int(0.1 * total_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    history = {"epoch": [], "train_loss": [], "dev_loss": [], "dev_f1": []}
    history_path = "srl_training_history.pkl"  # where the loss/F1 curves are pickled
    best_dev, best_path = -1.0, "best_srl.ckpt"
    for epoch in range(num_epochs):
        tr_loss = train_one_epoch(
            model, train_loader, optimizer, device=device,
            scheduler=scheduler, grad_accum_steps=grad_accum_steps, amp=True, max_grad_norm=1.0
        )
        dev_loss, dev_f1 = eval_loss_and_token_f1(model, dev_loader, id2label, device=device)
        history["epoch"].append(epoch + 1)
        history["train_loss"].append(tr_loss)
        history["dev_loss"].append(dev_loss)
        history["dev_f1"].append(dev_f1)
        print(f"Epoch {epoch+1}: train_loss={tr_loss:.4f}  dev_loss={dev_loss:.4f}  dev_F1={dev_f1:.4f}")
        if dev_f1 > best_dev:
            best_dev = dev_f1
            weights = model.state_dict()
            # "model_state" is the original key; "state_dict" and "hparams" are
            # what testing.py reads. Both keys reference the same object, so
            # the weights are serialized only once.
            torch.save(
                {"model_state": weights, "state_dict": weights, "hparams": hparams, "label2id": label2id},
                best_path,
            )
            print("  ↳ new best dev; saved.")
    save_pkl(history, history_path)