Spaces: Running
Upload 37 files
Browse files
- Nested/__init__.py +0 -0
- Nested/__pycache__/__init__.cpython-311.pyc +0 -0
- Nested/bin/__init__.py +0 -0
- Nested/bin/eval.py +87 -0
- Nested/bin/infer.py +73 -0
- Nested/bin/process.py +140 -0
- Nested/bin/train.py +222 -0
- Nested/data/__init__.py +0 -0
- Nested/data/__pycache__/__init__.cpython-311.pyc +0 -0
- Nested/data/__pycache__/datasets.cpython-311.pyc +0 -0
- Nested/data/__pycache__/transforms.cpython-311.pyc +0 -0
- Nested/data/datasets.py +150 -0
- Nested/data/transforms.py +127 -0
- Nested/nn/BaseModel.py +22 -0
- Nested/nn/BertNestedTagger.py +34 -0
- Nested/nn/BertSeqTagger.py +4 -1
- Nested/nn/__init__.py +3 -0
- Nested/nn/__pycache__/BaseModel.cpython-311.pyc +0 -0
- Nested/nn/__pycache__/BertNestedTagger.cpython-311.pyc +0 -0
- Nested/nn/__pycache__/BertSeqTagger.cpython-311.pyc +0 -0
- Nested/nn/__pycache__/__init__.cpython-311.pyc +0 -0
- Nested/trainers/BaseTrainer.py +117 -0
- Nested/trainers/BertNestedTrainer.py +203 -0
- Nested/trainers/BertTrainer.py +163 -0
- Nested/trainers/__init__.py +3 -0
- Nested/trainers/__pycache__/BaseTrainer.cpython-311.pyc +0 -0
- Nested/trainers/__pycache__/BertNestedTrainer.cpython-311.pyc +0 -0
- Nested/trainers/__pycache__/BertTrainer.cpython-311.pyc +0 -0
- Nested/trainers/__pycache__/__init__.cpython-311.pyc +0 -0
- Nested/utils/__init__.py +0 -0
- Nested/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- Nested/utils/__pycache__/data.cpython-311.pyc +0 -0
- Nested/utils/__pycache__/helpers.cpython-311.pyc +0 -0
- Nested/utils/__pycache__/metrics.cpython-311.pyc +0 -0
- Nested/utils/data.py +112 -38
- Nested/utils/helpers.py +117 -0
- Nested/utils/metrics.py +69 -0
Nested/__init__.py
ADDED
File without changes

Nested/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (149 Bytes)
Nested/bin/__init__.py
ADDED
File without changes
Nested/bin/eval.py
ADDED
@@ -0,0 +1,87 @@
import os
import logging
import argparse
from collections import namedtuple
from Nested.utils.helpers import load_checkpoint, make_output_dirs, logging_config
from Nested.utils.data import get_dataloaders, parse_conll_files
from Nested.utils.metrics import compute_single_label_metrics, compute_nested_metrics

logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to save results",
    )

    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Model path",
    )

    parser.add_argument(
        "--data_paths",
        nargs="+",
        type=str,
        required=True,
        help="Data files to tag, in the same format as the training data, with the 'O' tag for all tokens",
    )

    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size",
    )

    args = parser.parse_args()

    return args


def main(args):
    # Create directory to save predictions
    make_output_dirs(args.output_path, overwrite=True)
    logging_config(log_file=os.path.join(args.output_path, "eval.log"))

    # Load tagger
    tagger, tag_vocab, train_config = load_checkpoint(args.model_path)

    # Parse the data files into tagger datasets and index their tokens
    datasets, vocab = parse_conll_files(args.data_paths)

    vocabs = namedtuple("Vocab", ["tags", "tokens"])
    vocab = vocabs(tokens=vocab.tokens, tags=tag_vocab)

    # From the datasets generate the dataloaders
    dataloaders = get_dataloaders(
        datasets, vocab,
        train_config.data_config,
        batch_size=args.batch_size,
        shuffle=[False] * len(datasets)
    )

    # Evaluate the model on each dataloader
    for dataloader, input_file in zip(dataloaders, args.data_paths):
        filename = os.path.basename(input_file)
        predictions_file = os.path.join(args.output_path, f"predictions_{filename}")
        _, segments, _, _ = tagger.eval(dataloader)
        tagger.segments_to_file(segments, predictions_file)

        if "Nested" in train_config.trainer_config["fn"]:
            compute_nested_metrics(segments, vocab.tags[1:])
        else:
            compute_single_label_metrics(segments)


if __name__ == "__main__":
    main(parse_args())
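A minimal sketch of driving this entry point programmatically rather than from the shell, assuming the Nested package is importable; every path below is a hypothetical placeholder, not a file from this commit:

from argparse import Namespace
from Nested.bin.eval import main

# Hypothetical paths; data_paths must point to CoNLL files with an 'O' tag per token
main(Namespace(
    output_path="runs/eval",
    model_path="runs/train",
    data_paths=["data/test.txt"],
    batch_size=32,
))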
Nested/bin/infer.py
ADDED
@@ -0,0 +1,73 @@
import logging
import argparse
from collections import namedtuple
from Nested.utils.helpers import load_checkpoint
from Nested.utils.data import get_dataloaders, text2segments

logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Model path",
    )

    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="Text or sequence to tag",
    )

    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size",
    )

    args = parser.parse_args()

    return args


def main(args):
    # Load tagger
    tagger, tag_vocab, train_config = load_checkpoint(args.model_path)

    # Convert the text to a tagger dataset and index the tokens in args.text
    dataset, token_vocab = text2segments(args.text)

    vocabs = namedtuple("Vocab", ["tags", "tokens"])
    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)

    # From the dataset generate the dataloader
    dataloader = get_dataloaders(
        (dataset,),
        vocab,
        train_config.data_config,
        batch_size=args.batch_size,
        shuffle=(False,),
    )[0]

    # Perform inference on the text and get back the tagged segments
    segments = tagger.infer(dataloader)

    # Print results
    for segment in segments:
        s = [
            f"{token.text} ({'|'.join([t['tag'] for t in token.pred_tag])})"
            for token in segment
        ]
        print(" ".join(s))


if __name__ == "__main__":
    main(parse_args())
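The same pattern works for inference, again as a sketch with placeholder values; exactly how text2segments tokenizes the raw string is an assumption here, since Nested/utils/data.py is part of this commit but its body is not shown above:

from argparse import Namespace
from Nested.bin.infer import main

main(Namespace(
    model_path="runs/train",        # hypothetical trained-model directory
    text="some sequence to tag",    # placeholder input text
    batch_size=32,
))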
Nested/bin/process.py
ADDED
@@ -0,0 +1,140 @@
import os
import argparse
import csv
import logging
import numpy as np
from Nested.utils.helpers import logging_config
from Nested.utils.data import conll_to_segments

logger = logging.getLogger(__name__)


def to_conll_format(input_files, output_path, multi_label=False):
    """
    Parse data files and convert them into CoNLL format
    :param input_files: List[str] - list of filenames
    :param output_path: str - output path
    :param multi_label: boolean - True to process multi-class/multi-label data
    :return:
    """
    for input_file in input_files:
        tokens = list()
        prev_sent_id = None

        with open(input_file, "r") as fh:
            r = csv.reader(fh, delimiter="\t", quotechar=" ")
            next(r)

            for row in r:
                sent_id, token, labels = row[1], row[3], row[4].split()
                valid_labels = sum([1 for l in labels if "-" in l or l == "O"]) == len(labels)

                if not valid_labels:
                    logging.warning("Invalid labels found %s", str(row))
                    continue
                if not labels:
                    logging.warning("Token %s has no label", str(row))
                    continue
                if not token:
                    logging.warning("Token %s is missing", str(row))
                    continue
                if len(token.split()) > 1:
                    logging.warning("Token %s has multiple tokens", str(row))
                    continue

                if prev_sent_id is not None and sent_id != prev_sent_id:
                    tokens.append([])

                if multi_label:
                    tokens.append([token] + labels)
                else:
                    tokens.append([token, labels[0]])

                prev_sent_id = sent_id

        num_segments = sum([1 for token in tokens if not token])
        logging.info("Found %d segments and %d tokens in %s", num_segments + 1, len(tokens) - num_segments, input_file)

        filename = os.path.basename(input_file)
        output_file = os.path.join(output_path, filename)

        with open(output_file, "w") as fh:
            fh.write("\n".join(" ".join(token) for token in tokens))
        logging.info("Output file %s", output_file)


def train_dev_test_split(input_files, output_path, train_ratio, dev_ratio):
    segments = list()
    filenames = ["train.txt", "val.txt", "test.txt"]

    for input_file in input_files:
        segments += conll_to_segments(input_file)

    n = len(segments)
    np.random.shuffle(segments)
    datasets = np.split(segments, [int(train_ratio * n), int((train_ratio + dev_ratio) * n)])

    # Write data to files
    for i in range(len(datasets)):
        filename = os.path.join(output_path, filenames[i])

        with open(filename, "w") as fh:
            text = "\n\n".join(["\n".join([f"{token.text} {' '.join(token.gold_tag)}" for token in segment]) for segment in datasets[i]])
            fh.write(text)
        logging.info("Output file %s", filename)


def main(args):
    if args.task == "to_conll_format":
        to_conll_format(args.input_files, args.output_path, multi_label=args.multi_label)
    if args.task == "train_dev_test_split":
        train_dev_test_split(args.input_files, args.output_path, args.train_ratio, args.dev_ratio)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--input_files",
        type=str,
        nargs="+",
        required=True,
        help="List of input files",
    )

    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output path",
    )

    parser.add_argument(
        "--train_ratio",
        type=float,
        required=False,
        help="Training data ratio (percent of segments). Required with the task train_dev_test_split. "
             "Files must be in CoNLL format",
    )

    parser.add_argument(
        "--dev_ratio",
        type=float,
        required=False,
        help="Dev/val data ratio (percent of segments). Required with the task train_dev_test_split. "
             "Files must be in CoNLL format",
    )

    parser.add_argument(
        "--task", required=True, choices=["to_conll_format", "train_dev_test_split"]
    )

    parser.add_argument(
        "--multi_label", action="store_true"
    )

    args = parser.parse_args()
    logging_config(os.path.join(args.output_path, "process.log"))
    main(args)
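The split points handed to np.split in train_dev_test_split are cumulative fractions of the shuffled segment list; a self-contained illustration of that arithmetic with made-up ratios:

import numpy as np

segments = list(range(10))              # stand-ins for parsed segments
train_ratio, dev_ratio = 0.8, 0.1
n = len(segments)
np.random.shuffle(segments)
# Split at 80% and 90% of n: an 8/1/1 train/val/test partition
train, dev, test = np.split(np.array(segments),
                            [int(train_ratio * n), int((train_ratio + dev_ratio) * n)])
print(len(train), len(dev), len(test))  # 8 1 1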
Nested/bin/train.py
ADDED
@@ -0,0 +1,222 @@
import os
import logging
import json
import argparse
import pickle
import torch
import torch.utils.tensorboard
from Nested.utils.data import get_dataloaders, parse_conll_files
from Nested.utils.helpers import logging_config, load_object, make_output_dirs, set_seed

logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output path",
    )

    parser.add_argument(
        "--train_path",
        type=str,
        required=True,
        help="Path to training data",
    )

    parser.add_argument(
        "--val_path",
        type=str,
        required=True,
        help="Path to validation data",
    )

    parser.add_argument(
        "--test_path",
        type=str,
        required=True,
        help="Path to test data",
    )

    parser.add_argument(
        "--bert_model",
        type=str,
        default="aubmindlab/bert-base-arabertv2",
        help="BERT model",
    )

    parser.add_argument(
        "--gpus",
        type=int,
        nargs="+",
        default=[0],
        help="GPU IDs to train on",
    )

    parser.add_argument(
        "--log_interval",
        type=int,
        default=10,
        help="Log results every that many timesteps",
    )

    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size",
    )

    parser.add_argument(
        "--num_workers",
        type=int,
        default=0,
        help="Dataloader number of workers",
    )

    parser.add_argument(
        "--data_config",
        type=json.loads,
        default='{"fn": "Nested.data.datasets.DefaultDataset", "kwargs": {"max_seq_len": 512}}',
        help="Dataset configurations",
    )

    parser.add_argument(
        "--trainer_config",
        type=json.loads,
        default='{"fn": "Nested.trainers.BertTrainer", "kwargs": {"max_epochs": 50}}',
        help="Trainer configurations",
    )

    parser.add_argument(
        "--network_config",
        type=json.loads,
        default='{"fn": "Nested.nn.BertSeqTagger", "kwargs": '
                '{"dropout": 0.1, "bert_model": "aubmindlab/bert-base-arabertv2"}}',
        help="Network configurations",
    )

    parser.add_argument(
        "--optimizer",
        type=json.loads,
        default='{"fn": "torch.optim.AdamW", "kwargs": {"lr": 0.0001}}',
        help="Optimizer configurations",
    )

    parser.add_argument(
        "--lr_scheduler",
        type=json.loads,
        default='{"fn": "torch.optim.lr_scheduler.ExponentialLR", "kwargs": {"gamma": 1}}',
        help="Learning rate scheduler configurations",
    )

    parser.add_argument(
        "--loss",
        type=json.loads,
        default='{"fn": "torch.nn.CrossEntropyLoss", "kwargs": {}}',
        help="Loss function configurations",
    )

    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite output directory",
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=1,
        help="Seed for random initialization",
    )

    args = parser.parse_args()

    return args


def main(args):
    make_output_dirs(
        args.output_path,
        subdirs=("tensorboard", "checkpoints"),
        overwrite=args.overwrite,
    )

    # Set the seed for randomization
    set_seed(args.seed)

    logging_config(os.path.join(args.output_path, "train.log"))
    summary_writer = torch.utils.tensorboard.SummaryWriter(
        os.path.join(args.output_path, "tensorboard")
    )
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(gpu) for gpu in args.gpus])

    # Get the datasets and vocab for tags and tokens
    datasets, vocab = parse_conll_files((args.train_path, args.val_path, args.test_path))

    if "Nested" in args.network_config["fn"]:
        args.network_config["kwargs"]["num_labels"] = [len(v) for v in vocab.tags[1:]]
    else:
        args.network_config["kwargs"]["num_labels"] = len(vocab.tags[0])

    args.data_config["kwargs"]["bert_model"] = args.network_config["kwargs"]["bert_model"]

    # Save tag vocab to disk
    with open(os.path.join(args.output_path, "tag_vocab.pkl"), "wb") as fh:
        pickle.dump(vocab.tags, fh)

    # Write config to file
    args_file = os.path.join(args.output_path, "args.json")
    with open(args_file, "w") as fh:
        logger.info("Writing config to %s", args_file)
        json.dump(args.__dict__, fh, indent=4)

    # From the datasets generate the dataloaders
    train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
        datasets, vocab, args.data_config, args.batch_size, args.num_workers
    )

    model = load_object(args.network_config["fn"], args.network_config["kwargs"])
    model = torch.nn.DataParallel(model, device_ids=range(len(args.gpus)))

    if torch.cuda.is_available():
        model = model.cuda()

    args.optimizer["kwargs"]["params"] = model.parameters()
    optimizer = load_object(args.optimizer["fn"], args.optimizer["kwargs"])

    args.lr_scheduler["kwargs"]["optimizer"] = optimizer
    if "num_training_steps" in args.lr_scheduler["kwargs"]:
        args.lr_scheduler["kwargs"]["num_training_steps"] = args.trainer_config["kwargs"]["max_epochs"] * len(
            train_dataloader
        )

    scheduler = load_object(args.lr_scheduler["fn"], args.lr_scheduler["kwargs"])
    loss = load_object(args.loss["fn"], args.loss["kwargs"])

    args.trainer_config["kwargs"].update({
        "model": model,
        "optimizer": optimizer,
        "scheduler": scheduler,
        "loss": loss,
        "train_dataloader": train_dataloader,
        "val_dataloader": val_dataloader,
        "test_dataloader": test_dataloader,
        "log_interval": args.log_interval,
        "summary_writer": summary_writer,
        "output_path": args.output_path
    })

    trainer = load_object(args.trainer_config["fn"], args.trainer_config["kwargs"])
    trainer.train()
    return


if __name__ == "__main__":
    main(parse_args())
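All of the JSON flags above share the {"fn": "<dotted path>", "kwargs": {...}} shape that load_object consumes. Nested/utils/helpers.py is part of this commit but its body is not shown here, so the following resolver is only a sketch of how such a loader typically works, not the actual implementation:

import importlib

def load_object_sketch(fn, kwargs):
    # Resolve a dotted path such as "torch.optim.AdamW" and instantiate it
    module_path, _, name = fn.rpartition(".")
    obj = getattr(importlib.import_module(module_path), name)
    return obj(**kwargs)

# e.g. load_object_sketch("torch.nn.CrossEntropyLoss", {}) returns a loss module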
Nested/data/__init__.py
ADDED
File without changes

Nested/data/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (154 Bytes)

Nested/data/__pycache__/datasets.cpython-311.pyc
ADDED
Binary file (7.31 kB)

Nested/data/__pycache__/transforms.cpython-311.pyc
ADDED
Binary file (9.26 kB)
Nested/data/datasets.py
ADDED
@@ -0,0 +1,150 @@
import logging
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from Nested.data.transforms import (
    BertSeqTransform,
    NestedTagsTransform
)

logger = logging.getLogger(__name__)


class Token:
    def __init__(self, text=None, pred_tag=None, gold_tag=None):
        """
        Token object to hold token attributes
        :param text: str
        :param pred_tag: str
        :param gold_tag: str
        """
        self.text = text
        self.gold_tag = gold_tag
        self.pred_tag = pred_tag
        self.subwords = None

    @property
    def subwords(self):
        return self._subwords

    @subwords.setter
    def subwords(self, value):
        self._subwords = value

    def __str__(self):
        """
        Token text representation
        :return: str
        """
        gold_tags = "|".join(self.gold_tag)

        if self.pred_tag:
            pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
        else:
            pred_tags = ""

        if self.gold_tag:
            r = f"{self.text}\t{gold_tags}\t{pred_tags}"
        else:
            r = f"{self.text}\t{pred_tags}"

        return r


class DefaultDataset(Dataset):
    def __init__(
        self,
        examples=None,
        vocab=None,
        bert_model="aubmindlab/bert-base-arabertv2",
        max_seq_len=512,
    ):
        """
        The dataset used to transform the segments into training data
        :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
                         You can generate examples with Nested.utils.data.parse_conll_files
        :param vocab: vocab object containing indexed tags and tokens
        :param bert_model: str - BERT model
        :param max_seq_len: int - maximum sequence length
        """
        self.transform = BertSeqTransform(bert_model, vocab, max_seq_len=max_seq_len)
        self.examples = examples
        self.vocab = vocab

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        subwords, tags, tokens, valid_len = self.transform(self.examples[item])
        return subwords, tags, tokens, valid_len

    def collate_fn(self, batch):
        """
        Collate function that is called when the batch is fetched by the trainer
        :param batch: Dataloader batch
        :return: Same output as the __getitem__ function
        """
        subwords, tags, tokens, valid_len = zip(*batch)

        # Pad sequences in this batch
        # subwords and tokens are padded with zeros
        # tags are padded with the index of the O tag
        subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
        tags = pad_sequence(
            tags, batch_first=True, padding_value=self.vocab.tags[0].get_stoi()["O"]
        )
        return subwords, tags, tokens, valid_len


class NestedTagsDataset(Dataset):
    def __init__(
        self,
        examples=None,
        vocab=None,
        bert_model="aubmindlab/bert-base-arabertv2",
        max_seq_len=512,
    ):
        """
        The dataset used to transform the segments into training data
        :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
                         You can generate examples with Nested.utils.data.parse_conll_files
        :param vocab: vocab object containing indexed tags and tokens
        :param bert_model: str - BERT model
        :param max_seq_len: int - maximum sequence length
        """
        self.transform = NestedTagsTransform(
            bert_model, vocab, max_seq_len=max_seq_len
        )
        self.examples = examples
        self.vocab = vocab

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        subwords, tags, tokens, masks, valid_len = self.transform(self.examples[item])
        return subwords, tags, tokens, masks, valid_len

    def collate_fn(self, batch):
        """
        Collate function that is called when the batch is fetched by the trainer
        :param batch: Dataloader batch
        :return: Same output as the __getitem__ function
        """
        subwords, tags, tokens, masks, valid_len = zip(*batch)

        # Pad sequences in this batch
        # subwords and tokens are padded with zeros
        # tags are padded with the index of the O tag
        subwords = pad_sequence(subwords, batch_first=True, padding_value=0)

        masks = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), 0)(mask)
                 for tag, mask in zip(tags, masks)]
        masks = torch.cat(masks)

        # Pad the tags, doing the padding for each tag type
        tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
                for tag, vocab in zip(tags, self.vocab.tags[1:])]
        tags = torch.cat(tags)

        return subwords, tags, tokens, masks, valid_len
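A self-contained illustration of the padding convention used by the collate functions above: subwords pad with 0, while tags pad with the index of the "O" tag (assumed to be 2 in this demo):

import torch
from torch.nn.utils.rnn import pad_sequence

o_index = 2  # hypothetical index of "O" in the tag vocab
tags = [torch.LongTensor([0, 1, 2]), torch.LongTensor([1])]
print(pad_sequence(tags, batch_first=True, padding_value=o_index))
# tensor([[0, 1, 2],
#         [1, 2, 2]])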
Nested/data/transforms.py
ADDED
@@ -0,0 +1,127 @@
import torch
from transformers import BertTokenizer
from functools import partial
import logging
import re
import itertools
import Nested

logger = logging.getLogger(__name__)


class BertSeqTransform:
    def __init__(self, bert_model, vocab, max_seq_len=512):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.encoder = partial(
            self.tokenizer.encode,
            max_length=max_seq_len,
            truncation=True,
        )
        self.max_seq_len = max_seq_len
        self.vocab = vocab

    def __call__(self, segment):
        subwords, tags, tokens = list(), list(), list()
        unk_token = Nested.data.datasets.Token(text="UNK")

        for token in segment:
            # Sometimes the tokenizer fails to encode the word and returns no
            # input_ids; in that case, we use the input_id for [UNK]
            token_subwords = self.encoder(token.text)[1:-1] or self.encoder("[UNK]")[1:-1]
            subwords += token_subwords
            tags += [self.vocab.tags[0].get_stoi()[token.gold_tag[0]]] + [self.vocab.tags[0].get_stoi()["O"]] * (len(token_subwords) - 1)
            tokens += [token] + [unk_token] * (len(token_subwords) - 1)

        # Truncate to max_seq_len
        if len(subwords) > self.max_seq_len - 2:
            text = " ".join([t.text for t in tokens if t.text != "UNK"])
            logger.info("Truncating the sequence %s to %d", text, self.max_seq_len - 2)
            subwords = subwords[:self.max_seq_len - 2]
            tags = tags[:self.max_seq_len - 2]
            tokens = tokens[:self.max_seq_len - 2]

        subwords.insert(0, self.tokenizer.cls_token_id)
        subwords.append(self.tokenizer.sep_token_id)

        tags.insert(0, self.vocab.tags[0].get_stoi()["O"])
        tags.append(self.vocab.tags[0].get_stoi()["O"])

        tokens.insert(0, unk_token)
        tokens.append(unk_token)

        return torch.LongTensor(subwords), torch.LongTensor(tags), tokens, len(tokens)


class NestedTagsTransform:
    def __init__(self, bert_model, vocab, max_seq_len=512):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.encoder = partial(
            self.tokenizer.encode,
            max_length=max_seq_len,
            truncation=True,
        )
        self.max_seq_len = max_seq_len
        self.vocab = vocab

    def __call__(self, segment):
        tags, tokens, subwords = list(), list(), list()
        unk_token = Nested.data.datasets.Token(text="UNK")

        # Encode each token and get its subwords and IDs
        for token in segment:
            # Sometimes the tokenizer fails to encode the word and returns no
            # input_ids; in that case, we use the input_id for [UNK]
            token.subwords = self.encoder(token.text)[1:-1] or self.encoder("[UNK]")[1:-1]
            subwords += token.subwords
            tokens += [token] + [unk_token] * (len(token.subwords) - 1)

        # Construct the labels for each tag type
        # The sequence will have a list of tags for each type
        # The final tags for a sequence is a matrix NUM_TAG_TYPES x SEQ_LEN
        # Example:
        # [
        #   [O, O, B-PERS, I-PERS, O, O, O]
        #   [B-ORG, I-ORG, O, O, O, O, O]
        #   [O, O, O, O, O, O, B-GPE]
        # ]
        for vocab in self.vocab.tags[1:]:
            vocab_tags = "|".join(["^" + t + "$" for t in vocab.get_itos() if "-" in t])
            r = re.compile(vocab_tags)

            # This is really messy
            # For a given token we find a matching tag_name, BUT we might find
            # multiple matches (i.e. a token can be labeled B-ORG and I-ORG); in this
            # case we take only the first tag, as we do not have overlaps of the same type
            single_type_tags = [[(list(filter(r.match, token.gold_tag))
                                  or ["O"])[0]] + ["O"] * (len(token.subwords) - 1)
                                for token in segment]
            single_type_tags = list(itertools.chain(*single_type_tags))
            tags.append([vocab.get_stoi()[tag] for tag in single_type_tags])

        # Truncate to max_seq_len
        if len(subwords) > self.max_seq_len - 2:
            text = " ".join([t.text for t in tokens if t.text != "UNK"])
            logger.info("Truncating the sequence %s to %d", text, self.max_seq_len - 2)
            subwords = subwords[:self.max_seq_len - 2]
            tags = [t[:self.max_seq_len - 2] for t in tags]
            tokens = tokens[:self.max_seq_len - 2]

        # Add a dummy token at the start and end of the sequence
        tokens.insert(0, unk_token)
        tokens.append(unk_token)

        # Add CLS and SEP at the start and end of the subwords
        subwords.insert(0, self.tokenizer.cls_token_id)
        subwords.append(self.tokenizer.sep_token_id)
        subwords = torch.LongTensor(subwords)

        # Add "O" tags for the first and last subwords
        tags = torch.Tensor(tags)
        tags = torch.column_stack((
            torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
            tags,
            torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
        )).unsqueeze(0)

        mask = torch.ones_like(tags)
        return subwords, tags, tokens, mask, len(tokens)
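The per-type label construction above hinges on one alternation regex per tag-type vocab; here is that matching step in isolation, with hypothetical tags:

import re

type_vocab = ["O", "B-PERS", "I-PERS"]  # hypothetical single-type vocab
r = re.compile("|".join("^" + t + "$" for t in type_vocab if "-" in t))

# Keep the first gold tag that belongs to this type, else fall back to "O"
gold_tag = ["B-ORG", "I-PERS"]
print((list(filter(r.match, gold_tag)) or ["O"])[0])  # I-PERS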
Nested/nn/BaseModel.py
ADDED
@@ -0,0 +1,22 @@
from torch import nn
from transformers import BertModel
import logging

logger = logging.getLogger(__name__)


class BaseModel(nn.Module):
    def __init__(self,
                 bert_model="aubmindlab/bert-base-arabertv2",
                 num_labels=2,
                 dropout=0.1,
                 num_types=0):
        super().__init__()

        self.bert_model = bert_model
        self.num_labels = num_labels
        self.num_types = num_types
        self.dropout = dropout

        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)
Nested/nn/BertNestedTagger.py
ADDED
@@ -0,0 +1,34 @@
import torch
import torch.nn as nn
from Nested.nn import BaseModel


class BertNestedTagger(BaseModel):
    def __init__(self, **kwargs):
        super(BertNestedTagger, self).__init__(**kwargs)

        self.max_num_labels = max(self.num_labels)
        classifiers = [nn.Linear(768, num_labels) for num_labels in self.num_labels]
        self.classifiers = torch.nn.Sequential(*classifiers)

    def forward(self, x):
        y = self.bert(x)
        y = self.dropout(y["last_hidden_state"])
        output = list()

        for i, classifier in enumerate(self.classifiers):
            logits = classifier(y)

            # Pad logits to allow multi-GPU/DataParallel training to work
            # We will truncate the padded dimensions when we compute the loss in the trainer
            logits = torch.nn.ConstantPad1d((0, self.max_num_labels - logits.shape[-1]), 0)(logits)
            output.append(logits)

        # Return a tensor of the shape B x T x L x C
        # B: batch size
        # T: sequence length
        # L: number of tag types
        # C: number of classes per tag type
        output = torch.stack(output).permute((1, 2, 0, 3))
        return output
Nested/nn/BertSeqTagger.py
CHANGED
@@ -1,14 +1,17 @@
 import torch.nn as nn
 from transformers import BertModel
 
+
 class BertSeqTagger(nn.Module):
     def __init__(self, bert_model, num_labels=2, dropout=0.1):
         super().__init__()
+
         self.bert = BertModel.from_pretrained(bert_model)
         self.dropout = nn.Dropout(dropout)
         self.linear = nn.Linear(768, num_labels)
+
     def forward(self, x):
         y = self.bert(x)
         y = self.dropout(y["last_hidden_state"])
         logits = self.linear(y)
-        return logits
+        return logits
Nested/nn/__init__.py
ADDED
@@ -0,0 +1,3 @@
from Nested.nn.BaseModel import BaseModel
from Nested.nn.BertSeqTagger import BertSeqTagger
from Nested.nn.BertNestedTagger import BertNestedTagger
Nested/nn/__pycache__/BaseModel.cpython-311.pyc
ADDED
Binary file (1.34 kB)

Nested/nn/__pycache__/BertNestedTagger.cpython-311.pyc
ADDED
Binary file (2.33 kB)

Nested/nn/__pycache__/BertSeqTagger.cpython-311.pyc
ADDED
Binary file (1.54 kB)

Nested/nn/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (379 Bytes)
Nested/trainers/BaseTrainer.py
ADDED
@@ -0,0 +1,117 @@
import os
import torch
import logging
import natsort
import glob

logger = logging.getLogger(__name__)


class BaseTrainer:
    def __init__(
        self,
        model=None,
        max_epochs=50,
        optimizer=None,
        scheduler=None,
        loss=None,
        train_dataloader=None,
        val_dataloader=None,
        test_dataloader=None,
        log_interval=10,
        summary_writer=None,
        output_path=None,
        clip=5,
        patience=5
    ):
        self.model = model
        self.max_epochs = max_epochs
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.test_dataloader = test_dataloader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.loss = loss
        self.log_interval = log_interval
        self.summary_writer = summary_writer
        self.output_path = output_path
        self.current_timestep = 0
        self.current_epoch = 0
        self.clip = clip
        self.patience = patience

    def tag(self, dataloader, is_train=True):
        """
        Given a dataloader containing segments, predict the tags
        :param dataloader: torch.utils.data.DataLoader
        :param is_train: boolean - True for training mode, False for evaluation
        :return: Iterator
                 subwords (B x T x NUM_LABELS) - torch.Tensor - BERT subword IDs
                 gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tag IDs
                 tokens - List[Nested.data.datasets.Token] - list of tokens
                 valid_len (B x 1) - int - valid length of each sequence
                 logits (B x T x NUM_LABELS) - logits for each token and each tag
        """
        for subwords, gold_tags, tokens, valid_len in dataloader:
            self.model.train(is_train)

            if torch.cuda.is_available():
                subwords = subwords.cuda()
                gold_tags = gold_tags.cuda()

            if is_train:
                self.optimizer.zero_grad()
                logits = self.model(subwords)
            else:
                with torch.no_grad():
                    logits = self.model(subwords)

            yield subwords, gold_tags, tokens, valid_len, logits

    def segments_to_file(self, segments, filename):
        """
        Write segments to file
        :param segments: List[List[Nested.data.datasets.Token]] - list of lists of tokens
        :param filename: str - output filename
        :return: None
        """
        with open(filename, "w") as fh:
            results = "\n\n".join(["\n".join([str(t) for t in segment]) for segment in segments])
            fh.write("Token\tGold Tag\tPredicted Tag\n")
            fh.write(results)
        logging.info("Predictions written to %s", filename)

    def save(self):
        """
        Save model checkpoint
        :return: None
        """
        filename = os.path.join(
            self.output_path,
            "checkpoints",
            "checkpoint_{}.pt".format(self.current_epoch),
        )

        checkpoint = {
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "epoch": self.current_epoch
        }

        logger.info("Saving checkpoint to %s", filename)
        torch.save(checkpoint, filename)

    def load(self, checkpoint_path):
        """
        Load the latest model checkpoint
        :param checkpoint_path: str - path/to/checkpoints
        :return: None
        """
        checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
        checkpoint_path = checkpoint_path[-1]

        logger.info("Loading checkpoint %s", checkpoint_path)

        device = None if torch.cuda.is_available() else torch.device('cpu')
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
        self.model.load_state_dict(checkpoint["model"])
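BaseTrainer.load leans on natural sorting to pick the newest checkpoint; a quick demonstration of why plain lexicographic sorting would pick the wrong file:

import natsort

paths = ["checkpoint_2.pt", "checkpoint_10.pt", "checkpoint_1.pt"]
print(sorted(paths))             # ['checkpoint_1.pt', 'checkpoint_10.pt', 'checkpoint_2.pt']
print(natsort.natsorted(paths))  # ['checkpoint_1.pt', 'checkpoint_2.pt', 'checkpoint_10.pt']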
Nested/trainers/BertNestedTrainer.py
ADDED
@@ -0,0 +1,203 @@
import os
import logging
import torch
import numpy as np
from Nested.trainers import BaseTrainer
from Nested.utils.metrics import compute_nested_metrics

logger = logging.getLogger(__name__)


class BertNestedTrainer(BaseTrainer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def train(self):
        best_val_loss, test_loss = np.inf, np.inf
        num_train_batch = len(self.train_dataloader)
        num_labels = [len(v) for v in self.train_dataloader.dataset.vocab.tags[1:]]
        patience = self.patience

        for epoch_index in range(self.max_epochs):
            self.current_epoch = epoch_index
            train_loss = 0

            for batch_index, (subwords, gold_tags, tokens, valid_len, logits) in enumerate(self.tag(
                self.train_dataloader, is_train=True
            ), 1):
                self.current_timestep += 1

                # Compute losses for each output
                # logits = B x T x L x C
                losses = [self.loss(logits[:, :, i, 0:l].view(-1, logits[:, :, i, 0:l].shape[-1]),
                                    torch.reshape(gold_tags[:, i, :], (-1,)).long())
                          for i, l in enumerate(num_labels)]

                torch.autograd.backward(losses)

                # Avoid exploding gradients by doing gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)

                self.optimizer.step()
                self.scheduler.step()
                batch_loss = sum(l.item() for l in losses)
                train_loss += batch_loss

                if self.current_timestep % self.log_interval == 0:
                    logger.info(
                        "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
                        epoch_index,
                        batch_index,
                        num_train_batch,
                        self.current_timestep,
                        self.optimizer.param_groups[0]['lr'],
                        batch_loss
                    )

            train_loss /= num_train_batch

            logger.info("** Evaluating on validation dataset **")
            val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
            val_metrics = compute_nested_metrics(segments, self.val_dataloader.dataset.transform.vocab.tags[1:])

            epoch_summary_loss = {
                "train_loss": train_loss,
                "val_loss": val_loss
            }
            epoch_summary_metrics = {
                "val_micro_f1": val_metrics.micro_f1,
                "val_precision": val_metrics.precision,
                "val_recall": val_metrics.recall
            }

            logger.info(
                "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
                epoch_index,
                self.current_timestep,
                train_loss,
                val_loss,
                val_metrics.micro_f1
            )

            if val_loss < best_val_loss:
                patience = self.patience
                best_val_loss = val_loss
                logger.info("** Validation improved, evaluating test data **")
                test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
                self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
                test_metrics = compute_nested_metrics(segments, self.test_dataloader.dataset.transform.vocab.tags[1:])

                epoch_summary_loss["test_loss"] = test_loss
                epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
                epoch_summary_metrics["test_precision"] = test_metrics.precision
                epoch_summary_metrics["test_recall"] = test_metrics.recall

                logger.info(
                    "Epoch %d | Timestep %d | Test Loss %f | F1 %f",
                    epoch_index,
                    self.current_timestep,
                    test_loss,
                    test_metrics.micro_f1
                )

                self.save()
            else:
                patience -= 1

            # No improvement, terminate early
            if patience == 0:
                logger.info("Early termination triggered")
                break

            self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
            self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)

    def tag(self, dataloader, is_train=True):
        """
        Given a dataloader containing segments, predict the tags
        :param dataloader: torch.utils.data.DataLoader
        :param is_train: boolean - True for training mode, False for evaluation
        :return: Iterator
                 subwords (B x T x NUM_LABELS) - torch.Tensor - BERT subword IDs
                 gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tag IDs
                 tokens - List[Nested.data.datasets.Token] - list of tokens
                 valid_len (B x 1) - int - valid length of each sequence
                 logits (B x T x NUM_LABELS) - logits for each token and each tag
        """
        for subwords, gold_tags, tokens, mask, valid_len in dataloader:
            self.model.train(is_train)

            if torch.cuda.is_available():
                subwords = subwords.cuda()
                gold_tags = gold_tags.cuda()

            if is_train:
                self.optimizer.zero_grad()
                logits = self.model(subwords)
            else:
                with torch.no_grad():
                    logits = self.model(subwords)

            yield subwords, gold_tags, tokens, valid_len, logits

    def eval(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()
        num_labels = [len(v) for v in dataloader.dataset.vocab.tags[1:]]
        loss = 0

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            losses = [self.loss(logits[:, :, i, 0:l].view(-1, logits[:, :, i, 0:l].shape[-1]),
                                torch.reshape(gold_tags[:, i, :], (-1,)).long())
                      for i, l in enumerate(num_labels)]
            loss += sum(losses)
            preds += torch.argmax(logits, dim=3)
            segments += tokens
            valid_lens += list(valid_len)

        loss /= len(dataloader)

        # Update segments, attaching the predicted tags to each token
        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)

        return preds, segments, valid_lens, loss

    def infer(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            preds += torch.argmax(logits, dim=3)
            segments += tokens
            valid_lens += list(valid_len)

        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
        return segments

    def to_segments(self, segments, preds, valid_lens, vocab):
        if vocab is None:
            vocab = self.vocab

        tagged_segments = list()
        tokens_stoi = vocab.tokens.get_stoi()
        unk_id = tokens_stoi["UNK"]

        for segment, pred, valid_len in zip(segments, preds, valid_lens):
            # First, drop the token at the 0th index ([CLS]) and the last valid
            # token ([SEP]), then combine the tokens with their corresponding predictions
            segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])

            # Ignore the sub-tokens/subwords, which are identified by their text being UNK
            segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))

            # Attach the predicted tags to each token
            list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": vocab.get_itos()[tag_id]}
                     for tag_id, vocab in zip(t[1].int().tolist(), vocab.tags[1:])]), segment_pred))

            # We are only interested in the tagged tokens; we no longer need the raw model predictions
            tagged_segment = [t for t, _ in segment_pred]
            tagged_segments.append(tagged_segment)

        return tagged_segments
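The per-type loss above slices the padded class dimension back down to each type's real label count before cross-entropy; a dummy-tensor sketch of that slicing (all shapes and counts here are made up):

import torch
import torch.nn.functional as F

B, T, L, C = 2, 5, 3, 7            # batch, seq len, tag types, padded class dim
num_labels = [7, 4, 3]             # real class count per tag type
logits = torch.randn(B, T, L, C, requires_grad=True)
gold = torch.randint(0, 3, (B, L, T))

# reshape (rather than view) handles the non-contiguous slice in this sketch
losses = [
    F.cross_entropy(logits[:, :, i, :l].reshape(-1, l),
                    gold[:, i, :].reshape(-1))
    for i, l in enumerate(num_labels)
]
torch.autograd.backward(losses)    # one backward pass over all tag types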
Nested/trainers/BertTrainer.py
ADDED
@@ -0,0 +1,163 @@
import os
import logging
import torch
import numpy as np
from Nested.trainers import BaseTrainer
from Nested.utils.metrics import compute_single_label_metrics

logger = logging.getLogger(__name__)


class BertTrainer(BaseTrainer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def train(self):
        best_val_loss, test_loss = np.inf, np.inf
        num_train_batch = len(self.train_dataloader)
        patience = self.patience

        for epoch_index in range(self.max_epochs):
            self.current_epoch = epoch_index
            train_loss = 0

            for batch_index, (_, gold_tags, _, _, logits) in enumerate(self.tag(
                self.train_dataloader, is_train=True
            ), 1):
                self.current_timestep += 1
                batch_loss = self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
                batch_loss.backward()

                # Avoid exploding gradients by clipping the gradient norm
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)

                self.optimizer.step()
                self.scheduler.step()
                train_loss += batch_loss.item()

                if self.current_timestep % self.log_interval == 0:
                    logger.info(
                        "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
                        epoch_index,
                        batch_index,
                        num_train_batch,
                        self.current_timestep,
                        self.optimizer.param_groups[0]['lr'],
                        batch_loss.item()
                    )

            train_loss /= num_train_batch

            logger.info("** Evaluating on validation dataset **")
            val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
            val_metrics = compute_single_label_metrics(segments)

            epoch_summary_loss = {
                "train_loss": train_loss,
                "val_loss": val_loss
            }
            epoch_summary_metrics = {
                "val_micro_f1": val_metrics.micro_f1,
                "val_precision": val_metrics.precision,
                "val_recall": val_metrics.recall
            }

            logger.info(
                "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
                epoch_index,
                self.current_timestep,
                train_loss,
                val_loss,
                val_metrics.micro_f1
            )

            if val_loss < best_val_loss:
                patience = self.patience
                best_val_loss = val_loss
                logger.info("** Validation improved, evaluating test data **")
                test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
                self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
                test_metrics = compute_single_label_metrics(segments)

                epoch_summary_loss["test_loss"] = test_loss
                epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
                epoch_summary_metrics["test_precision"] = test_metrics.precision
                epoch_summary_metrics["test_recall"] = test_metrics.recall

                logger.info(
                    "Epoch %d | Timestep %d | Test Loss %f | F1 %f",
                    epoch_index,
                    self.current_timestep,
                    test_loss,
                    test_metrics.micro_f1
                )

                self.save()
            else:
                patience -= 1

            # No improvement, terminate early
            if patience == 0:
                logger.info("Early termination triggered")
                break

            self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
            self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)

    def eval(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()
        loss = 0

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            loss += self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
            segments += tokens
            valid_lens += list(valid_len)

        loss /= len(dataloader)

        # Update segments, attach predicted tags to each token
        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)

        return preds, segments, valid_lens, loss.item()

    def infer(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
            segments += tokens
            valid_lens += list(valid_len)

        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
        return segments

    def to_segments(self, segments, preds, valid_lens, vocab):
        if vocab is None:
            vocab = self.vocab

        tagged_segments = list()
        tokens_stoi = vocab.tokens.get_stoi()
        tags_itos = vocab.tags[0].get_itos()
        unk_id = tokens_stoi["UNK"]

        for segment, pred, valid_len in zip(segments, preds, valid_lens):
            # Drop the token at the 0th index ([CLS]) and the token at the nth index ([SEP]),
            # then combine the tokens with their corresponding predictions
            segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])

            # Ignore the sub-tokens/subwords, which are identified by their text being UNK
            segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))

            # Attach the predicted tags to each token
            list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": tags_itos[t[1]]}]), segment_pred))

            # We are only interested in the tagged tokens; we no longer need the raw model predictions
            tagged_segment = [t for t, _ in segment_pred]
            tagged_segments.append(tagged_segment)

        return tagged_segments
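A note on the loss calls in `train` and `eval` above: the `(batch, sequence length, num tags)` logits are flattened to `(batch * sequence length, num tags)` so a token-level loss can score every position at once. A minimal sketch with invented sizes, assuming the dynamically loaded loss is `torch.nn.CrossEntropyLoss`:

import torch

loss_fn = torch.nn.CrossEntropyLoss()

# Hypothetical sizes: 2 segments, 5 tokens, 7 tags in the flat tag vocab.
logits = torch.randn(2, 5, 7)
gold_tags = torch.randint(0, 7, (2, 5))

# Mirrors self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
batch_loss = loss_fn(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
print(batch_loss.item())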
Nested/trainers/__init__.py
ADDED
@@ -0,0 +1,3 @@
from Nested.trainers.BaseTrainer import BaseTrainer
from Nested.trainers.BertTrainer import BertTrainer
from Nested.trainers.BertNestedTrainer import BertNestedTrainer
Nested/trainers/__pycache__/BaseTrainer.cpython-311.pyc
ADDED
Binary file (6.45 kB). View file

Nested/trainers/__pycache__/BertNestedTrainer.cpython-311.pyc
ADDED
Binary file (13.4 kB). View file

Nested/trainers/__pycache__/BertTrainer.cpython-311.pyc
ADDED
Binary file (9.43 kB). View file

Nested/trainers/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (405 Bytes). View file

Nested/utils/__init__.py
ADDED
File without changes

Nested/utils/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (155 Bytes). View file

Nested/utils/__pycache__/data.cpython-311.pyc
ADDED
Binary file (8.66 kB). View file

Nested/utils/__pycache__/helpers.cpython-311.pyc
ADDED
Binary file (5.9 kB). View file

Nested/utils/__pycache__/metrics.cpython-311.pyc
ADDED
Binary file (5.45 kB). View file
Nested/utils/data.py
CHANGED
@@ -1,7 +1,16 @@
from torch.utils.data import DataLoader
from collections import Counter, namedtuple
import logging
import re
import itertools
from Nested.utils.helpers import load_object
from Nested.data.datasets import Token

logger = logging.getLogger(__name__)


class Vocab:
    def __init__(self, counter, specials=[]) -> None:
        self.itos = list(counter.keys()) + specials
        self.stoi = {s: i for i, s in enumerate(self.itos)}
        self.word_count = counter
@@ -12,44 +21,77 @@ class Vocab:
    def get_stoi(self) -> dict[str, int]:
        return self.stoi

    def __len__(self):
        return len(self.itos)


def conll_to_segments(filename):
    """
    Convert CoNLL files to segments. This returns a list of segments, where each segment
    is a list of tuples (token, tag)
    :param filename: Path
    :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
    """
    segments, segment = list(), list()

    with open(filename, "r") as fh:
        for token in fh.read().splitlines():
            if not token.strip():
                segments.append(segment)
                segment = list()
            else:
                parts = token.split()
                token = Token(text=parts[0], gold_tag=parts[1:])
                segment.append(token)

        segments.append(segment)

    return segments


def parse_conll_files(data_paths):
    """
    Parse CoNLL-formatted files, return the list of segments for each file, and index
    the vocabs and tags across all data_paths
    :param data_paths: tuple(Path) - tuple of filenames
    :return: tuple( [[(token, tag), ...], [(token, tag), ...]], -> segments for data_paths[i]
                    [[(token, tag), ...], [(token, tag), ...]], -> segments for data_paths[i+1],
                    ...
                  )
             List of segments for each dataset, where each segment has a list of (tokens, tags)
    """
    vocabs = namedtuple("Vocab", ["tags", "tokens"])
    datasets, tags, tokens = list(), list(), list()

    for data_path in data_paths:
        dataset = conll_to_segments(data_path)
        datasets.append(dataset)
        tokens += [token.text for segment in dataset for token in segment]
        tags += [token.gold_tag for segment in dataset for token in segment]

    # Flatten the list of tags
    tags = list(itertools.chain(*tags))

    # Generate vocabs for tags and tokens
    tag_vocabs = tag_vocab_by_type(tags)
    tag_vocabs.insert(0, Vocab(Counter(tags)))
    vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
    return tuple(datasets), vocabs


def tag_vocab_by_type(tags):
    vocabs = list()
    c = Counter(tags)
    tag_names = c.keys()
    tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))

    for tag_type in tag_types:
        r = re.compile(".*-" + tag_type + "$")
        t = list(filter(r.match, tags)) + ["O"]
        vocabs.append(Vocab(Counter(t)))

    return vocabs


def text2segments(text):
    """
@@ -57,6 +99,38 @@ def text2segments(text):
    """
    dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
    tokens = [token.text for segment in dataset for token in segment]

    # Generate vocabs for the tokens
    segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
    return dataset, segment_vocab


def get_dataloaders(
    datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
):
    """
    From the datasets, generate the dataloaders
    :param datasets: list - list of the datasets, each a list of segments and tokens
    :param batch_size: int
    :param num_workers: int
    :param shuffle: tuple(bool) - whether to shuffle each dataset
    :return: List[torch.utils.data.DataLoader]
    """
    dataloaders = list()

    for i, examples in enumerate(datasets):
        data_config["kwargs"].update({"examples": examples, "vocab": vocab})
        dataset = load_object(data_config["fn"], data_config["kwargs"])

        dataloader = DataLoader(
            dataset=dataset,
            shuffle=shuffle[i],
            batch_size=batch_size,
            num_workers=num_workers,
            collate_fn=dataset.collate_fn,
        )

        logger.info("%s batches found", len(dataloader))
        dataloaders.append(dataloader)

    return dataloaders
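To make the CoNLL conventions above concrete, a hedged usage sketch; the file content is invented, and it relies only on `Token` exposing `.text` and `.gold_tag` as in `Nested/data/datasets.py`:

from Nested.utils.data import conll_to_segments, parse_conll_files

# Hypothetical two-segment CoNLL file: one "token tag..." pair per line,
# with blank lines separating segments.
with open("/tmp/sample.conll", "w") as fh:
    fh.write("Paris B-LOC\nis O\nnice O\n\nJohn B-PER\n")

segments = conll_to_segments("/tmp/sample.conll")
print(len(segments))            # 2
print(segments[0][0].text)      # Paris
print(segments[0][0].gold_tag)  # ['B-LOC']

# parse_conll_files additionally builds token and tag vocabs across files:
# vocabs.tags holds one flat tag vocab plus one vocab per entity type.
(dataset,), vocabs = parse_conll_files(["/tmp/sample.conll"])
print(len(vocabs.tags))  # 3: flat vocab + LOC vocab + PER vocab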
Nested/utils/helpers.py
ADDED
@@ -0,0 +1,117 @@
import os
import sys
import logging
import importlib
import shutil
import torch
import pickle
import json
import random
import numpy as np
from argparse import Namespace


def logging_config(log_file=None):
    """
    Initialize custom logger
    :param log_file: str - full path to the log file
    :return: None
    """
    handlers = [logging.StreamHandler(sys.stdout)]

    if log_file:
        handlers.append(logging.FileHandler(log_file, "w", "utf-8"))
        print("Logging to {}".format(log_file))

    logging.basicConfig(
        level=logging.INFO,
        handlers=handlers,
        format="%(levelname)s\t%(name)s\t%(asctime)s\t%(message)s",
        datefmt="%a, %d %b %Y %H:%M:%S",
        force=True
    )


def load_object(name, kwargs):
    """
    Load objects dynamically given the object name and its arguments
    :param name: str - object name, class name or function name
    :param kwargs: dict - keyword arguments
    :return: object
    """
    object_module, object_name = name.rsplit(".", 1)
    object_module = importlib.import_module(object_module)
    fn = getattr(object_module, object_name)(**kwargs)
    return fn


def make_output_dirs(path, subdirs=[], overwrite=True):
    """
    Create the root directory and any other sub-directories
    :param path: str - root directory
    :param subdirs: List[str] - list of sub-directories
    :param overwrite: boolean - whether to overwrite the directory
    :return: None
    """
    if overwrite:
        shutil.rmtree(path, ignore_errors=True)

    os.makedirs(path)

    for subdir in subdirs:
        os.makedirs(os.path.join(path, subdir))


def load_checkpoint(model_path):
    """
    Load a model given the model path
    :param model_path: str - path to model
    :return: tagger - Nested.trainers.BaseTrainer - the tagger model
             vocab - Nested.utils.data.Vocab - indexed tags
             train_config - argparse.Namespace - training configurations
    """
    with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
        tag_vocab = pickle.load(fh)

    # Load train configurations from the checkpoint
    train_config = Namespace()
    with open(os.path.join(model_path, "args.json"), "r") as fh:
        train_config.__dict__ = json.load(fh)

    # Initialize the loss function; not used for inference, but needed for evaluation
    loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])

    # Load the BERT tagger
    model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
    model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    # Update arguments for the tagger:
    # attach the model and the loss (the loss is used in evaluation cases)
    train_config.trainer_config["kwargs"]["model"] = model
    train_config.trainer_config["kwargs"]["loss"] = loss

    tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
    tagger.load(os.path.join(model_path, "checkpoints"))
    return tagger, tag_vocab, train_config


def set_seed(seed):
    """
    Set the seed for random initialization and set
    cuDNN parameters to ensure deterministic results across
    multiple runs with the same seed

    :param seed: int
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
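A hedged usage sketch of `load_object` above: a dotted path plus a kwargs dict instantiates any importable callable. `collections.namedtuple` is a stand-in target here; in the repo, the same mechanism loads the network, loss, and trainer classes named in the training config:

from Nested.utils.helpers import load_object

# namedtuple is purely illustrative; any "module.Callable" path works.
Point = load_object(
    "collections.namedtuple",
    {"typename": "Point", "field_names": ["x", "y"]},
)
print(Point(x=1, y=2))  # Point(x=1, y=2)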
Nested/utils/metrics.py
ADDED
@@ -0,0 +1,69 @@
from seqeval.metrics import (
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
)
from seqeval.scheme import IOB2
from types import SimpleNamespace
import logging
import re

logger = logging.getLogger(__name__)


def compute_nested_metrics(segments, vocabs):
    """
    Compute metrics for nested NER
    :param segments: List[List[Nested.data.datasets.Token]] - list of segments
    :param vocabs: List[Nested.utils.data.Vocab] - one tag vocab per entity type
    :return: metrics - SimpleNamespace - F1 micro/macro/weighted, recall, precision, accuracy
    """
    y, y_hat = list(), list()

    # We duplicate the dataset N times, where N is the number of entity types.
    # For each copy, we create y and y_hat.
    # Example: the first copy creates pairs of ground-truth and predicted labels for
    # the entity type GPE, another copy creates pairs for LOC, etc.
    for i, vocab in enumerate(vocabs):
        vocab_tags = [tag for tag in vocab.get_itos() if "-" in tag]
        r = re.compile("|".join(vocab_tags))

        y += [[(list(filter(r.match, token.gold_tag)) or ["O"])[0] for token in segment] for segment in segments]
        y_hat += [[token.pred_tag[i]["tag"] for token in segment] for segment in segments]

    logger.info("\n" + classification_report(y, y_hat, scheme=IOB2, digits=4))

    metrics = {
        "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
        "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
        "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
        "precision": precision_score(y, y_hat, scheme=IOB2),
        "recall": recall_score(y, y_hat, scheme=IOB2),
        "accuracy": accuracy_score(y, y_hat),
    }

    return SimpleNamespace(**metrics)


def compute_single_label_metrics(segments):
    """
    Compute metrics for flat NER
    :param segments: List[List[Nested.data.datasets.Token]] - list of segments
    :return: metrics - SimpleNamespace - F1 micro/macro/weighted, recall, precision, accuracy
    """
    y = [[token.gold_tag[0] for token in segment] for segment in segments]
    y_hat = [[token.pred_tag[0]["tag"] for token in segment] for segment in segments]

    logger.info("\n" + classification_report(y, y_hat, scheme=IOB2, digits=4))

    metrics = {
        "micro_f1": f1_score(y, y_hat, average="micro", scheme=IOB2),
        "macro_f1": f1_score(y, y_hat, average="macro", scheme=IOB2),
        "weights_f1": f1_score(y, y_hat, average="weighted", scheme=IOB2),
        "precision": precision_score(y, y_hat, scheme=IOB2),
        "recall": recall_score(y, y_hat, scheme=IOB2),
        "accuracy": accuracy_score(y, y_hat),
    }

    return SimpleNamespace(**metrics)
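To illustrate the token interface the metrics expect, a hedged sketch that calls `compute_single_label_metrics` with hand-built stand-ins; `SimpleNamespace` fills in for `Nested.data.datasets.Token`, since only `gold_tag` and `pred_tag` are read here:

from types import SimpleNamespace
from Nested.utils.metrics import compute_single_label_metrics

def tok(gold, pred):
    # Stand-in for Nested.data.datasets.Token; only the fields the
    # metrics code reads are populated.
    return SimpleNamespace(gold_tag=[gold], pred_tag=[{"tag": pred}])

segments = [
    [tok("B-PER", "B-PER"), tok("I-PER", "I-PER"), tok("O", "O")],
    [tok("B-LOC", "O"), tok("O", "O")],
]

metrics = compute_single_label_metrics(segments)
# One of the two gold entities is recovered: precision 1.0, recall 0.5,
# so micro F1 is about 0.67.
print(metrics.micro_f1)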