Upload 7 files
Browse files- gector/bert_token_embedder.py +269 -0
- gector/datareader.py +151 -0
- gector/gec_model.py +298 -0
- gector/seq2labels_model.py +194 -0
- gector/tokenization.py +181 -0
- gector/tokenizer_indexer.py +161 -0
- gector/trainer.py +845 -0
gector/bert_token_embedder.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tweaked version of corresponding AllenNLP file"""
|
| 2 |
+
import logging
|
| 3 |
+
from copy import deepcopy
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from allennlp.modules.token_embedders.token_embedder import TokenEmbedder
|
| 9 |
+
from allennlp.nn import util
|
| 10 |
+
from transformers import AutoModel, PreTrainedModel
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PretrainedBertModel:
    """
    Caching factory for pretrained transformer models.

    In some instances you may want to load the same BERT model twice
    (e.g. to use as a token embedder and also as a pooling layer).
    This factory provides a cache so that you don't actually have to
    load the model twice.
    """

    # Maps model name -> already-loaded model instance.
    _cache: Dict[str, PreTrainedModel] = {}

    @classmethod
    def load(cls, model_name: str, cache_model: bool = True) -> PreTrainedModel:
        """Return a pretrained model for ``model_name``, reusing a cached copy if present."""
        cached = cls._cache.get(model_name)
        if cached is not None:
            return cached

        model = AutoModel.from_pretrained(model_name)
        if cache_model:
            cls._cache[model_name] = model

        return model
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class BertEmbedder(TokenEmbedder):
    """
    A ``TokenEmbedder`` that produces BERT embeddings for your tokens.
    Should be paired with a ``BertIndexer``, which produces wordpiece ids.
    Most likely you probably want to use ``PretrainedBertEmbedder``
    for one of the named pretrained models, not this base class.
    Parameters
    ----------
    bert_model: ``BertModel``
        The BERT model being wrapped.
    top_layer_only: ``bool``, optional (default = ``False``)
        If ``True``, then only return the top layer instead of apply the scalar mix.
    max_pieces : int, optional (default: 512)
        The BERT embedder uses positional embeddings and so has a corresponding
        maximum length for its input ids. Assuming the inputs are windowed
        and padded appropriately by this length, the embedder will split them into a
        large batch, feed them into BERT, and recombine the output as if it was a
        longer sequence.
    num_start_tokens : int, optional (default: 1)
        The number of starting special tokens input to BERT (usually 1, i.e., [CLS])
    num_end_tokens : int, optional (default: 1)
        The number of ending tokens input to BERT (usually 1, i.e., [SEP])
    scalar_mix_parameters: ``List[float]``, optional, (default = None)
        If not ``None``, use these scalar mix parameters to weight the representations
        produced by different layers. These mixing weights are not updated during
        training.
    """

    def __init__(
        self,
        bert_model: PreTrainedModel,
        top_layer_only: bool = False,
        max_pieces: int = 512,
        num_start_tokens: int = 1,
        num_end_tokens: int = 1
    ) -> None:
        super().__init__()
        # Deep copy so later in-place changes (freezing, resizing embeddings)
        # don't mutate the shared cached model in PretrainedBertModel._cache.
        self.bert_model = deepcopy(bert_model)
        self.output_dim = bert_model.config.hidden_size
        self.max_pieces = max_pieces
        self.num_start_tokens = num_start_tokens
        self.num_end_tokens = num_end_tokens
        # NOTE(review): ``top_layer_only`` is accepted but never stored or used;
        # ``_scalar_mix`` stays None here, so ``forward`` always takes the last
        # stacked layer. Confirm whether scalar mixing was meant to be wired up.
        self._scalar_mix = None

    def set_weights(self, freeze):
        # Freeze (or unfreeze) every transformer parameter in one pass.
        for param in self.bert_model.parameters():
            param.requires_grad = not freeze
        return

    def get_output_dim(self) -> int:
        # Hidden size of the wrapped transformer.
        return self.output_dim

    def forward(
        self,
        input_ids: torch.LongTensor,
        offsets: torch.LongTensor = None
    ) -> torch.Tensor:
        """
        Parameters
        ----------
        input_ids : ``torch.LongTensor``
            The (batch_size, ..., max_sequence_length) tensor of wordpiece ids.
        offsets : ``torch.LongTensor``, optional
            The BERT embeddings are one per wordpiece. However it's possible/likely
            you might want one per original token. In that case, ``offsets``
            represents the indices of the desired wordpiece for each original token.
            Depending on how your token indexer is configured, this could be the
            position of the last wordpiece for each token, or it could be the position
            of the first wordpiece for each token.
            For example, if you had the sentence "Definitely not", and if the corresponding
            wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids
            would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4].
            If offsets are provided, the returned tensor will contain only the wordpiece
            embeddings at those positions, and (in particular) will contain one embedding
            per token. If offsets are not provided, the entire tensor of wordpiece embeddings
            will be returned.
        """

        batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1)
        initial_dims = list(input_ids.shape[:-1])

        # The embedder may receive an input tensor that has a sequence length longer than can
        # be fit. In that case, we should expect the wordpiece indexer to create padded windows
        # of length `self.max_pieces` for us, and have them concatenated into one long sequence.
        # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..."
        # We can then split the sequence into sub-sequences of that length, and concatenate them
        # along the batch dimension so we effectively have one huge batch of partial sentences.
        # This can then be fed into BERT without any sentence length issues. Keep in mind
        # that the memory consumption can dramatically increase for large batches with extremely
        # long sentences.
        needs_split = full_seq_len > self.max_pieces
        last_window_size = 0
        if needs_split:
            # Split the flattened list by the window size, `max_pieces`
            split_input_ids = list(input_ids.split(self.max_pieces, dim=-1))

            # We want all sequences to be the same length, so pad the last sequence
            last_window_size = split_input_ids[-1].size(-1)
            padding_amount = self.max_pieces - last_window_size
            split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0)

            # Now combine the sequences along the batch dimension
            input_ids = torch.cat(split_input_ids, dim=0)

        # Attention mask: padding id is assumed to be 0 here.
        input_mask = (input_ids != 0).long()
        # input_ids may have extra dimensions, so we reshape down to 2-d
        # before calling the BERT model and then reshape back at the end.
        all_encoder_layers = self.bert_model(
            input_ids=util.combine_initial_dims(input_ids),
            attention_mask=util.combine_initial_dims(input_mask),
        )[0]
        # Normalise the transformers output to a (layers, batch, seq, dim) stack:
        # a list/tuple of 3-d layer tensors gets stacked; a single 2-d output is
        # given a leading layer dimension.
        if len(all_encoder_layers[0].shape) == 3:
            all_encoder_layers = torch.stack(all_encoder_layers)
        elif len(all_encoder_layers[0].shape) == 2:
            all_encoder_layers = torch.unsqueeze(all_encoder_layers, dim=0)

        if needs_split:
            # First, unpack the output embeddings into one long sequence again
            unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1)
            unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2)

            # Next, select indices of the sequence such that it will result in embeddings representing the original
            # sentence. To capture maximal context, the indices will be the middle part of each embedded window
            # sub-sequence (plus any leftover start and final edge windows), e.g.,
            #  0     1 2    3  4   5    6    7     8     9   10   11   12    13 14  15
            # "[CLS] I went to the very fine [SEP] [CLS] the very fine store to eat [SEP]"
            # with max_pieces = 8 should produce max context indices [2, 3, 4, 10, 11, 12] with additional start
            # and final windows with indices [0, 1] and [14, 15] respectively.

            # Find the stride as half the max pieces, ignoring the special start and end tokens
            # Calculate an offset to extract the centermost embeddings of each window
            stride = (self.max_pieces - self.num_start_tokens - self.num_end_tokens) // 2
            stride_offset = stride // 2 + self.num_start_tokens

            first_window = list(range(stride_offset))

            max_context_windows = [
                i
                for i in range(full_seq_len)
                if stride_offset - 1 < i % self.max_pieces < stride_offset + stride
            ]

            # Lookback what's left, unless it's the whole self.max_pieces window
            if full_seq_len % self.max_pieces == 0:
                lookback = self.max_pieces
            else:
                lookback = full_seq_len % self.max_pieces

            final_window_start = full_seq_len - lookback + stride_offset + stride
            final_window = list(range(final_window_start, full_seq_len))

            select_indices = first_window + max_context_windows + final_window

            initial_dims.append(len(select_indices))

            recombined_embeddings = unpacked_embeddings[:, :, select_indices]
        else:
            recombined_embeddings = all_encoder_layers

        # Recombine the outputs of all layers
        # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim)
        # recombined = torch.cat(combined, dim=2)
        # NOTE(review): this mask is recomputed from embedding *values* (not ids),
        # so a position counts as masked only where the embedding is exactly 0 —
        # confirm this is intended; it differs from the id-based mask above.
        input_mask = (recombined_embeddings != 0).long()

        if self._scalar_mix is not None:
            mix = self._scalar_mix(recombined_embeddings, input_mask)
        else:
            # _scalar_mix is always None in this tweaked version (see __init__),
            # so this takes the last layer of the stacked output.
            mix = recombined_embeddings[-1]

        # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim)

        if offsets is None:
            # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
            dims = initial_dims if needs_split else input_ids.size()
            return util.uncombine_initial_dims(mix, dims)
        else:
            # offsets is (batch_size, d1, ..., dn, orig_sequence_length)
            offsets2d = util.combine_initial_dims(offsets)
            # now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
            range_vector = util.get_range_vector(
                offsets2d.size(0), device=util.get_device_of(mix)
            ).unsqueeze(1)
            # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
            selected_embeddings = mix[range_vector, offsets2d]

            return util.uncombine_initial_dims(selected_embeddings, offsets.size())
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# @TokenEmbedder.register("bert-pretrained")
class PretrainedBertEmbedder(BertEmbedder):

    """
    A ``BertEmbedder`` built from a named pretrained transformer.

    Parameters
    ----------
    pretrained_model: ``str``
        Either the name of the pretrained model to use (e.g. 'bert-base-uncased'),
        or the path to the .tar.gz file with the model weights.
        If the name is a key in the list of pretrained models at
        https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41
        the corresponding path will be used; otherwise it will be interpreted as a path or URL.
    requires_grad : ``bool``, optional (default = False)
        If True, compute gradient of BERT parameters for fine tuning.
    top_layer_only: ``bool``, optional (default = ``False``)
        If ``True``, then only return the top layer instead of apply the scalar mix.
    special_tokens_fix: ``int``, optional (default = 0)
        If non-zero, grow the wordpiece embedding matrix by one extra row so an
        additional special token id can be embedded.
    """

    def __init__(
        self,
        pretrained_model: str,
        requires_grad: bool = False,
        top_layer_only: bool = False,
        special_tokens_fix: int = 0,
    ) -> None:
        model = PretrainedBertModel.load(pretrained_model)

        # Freeze or unfreeze the entire transformer up front.
        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model, top_layer_only=top_layer_only)

        if special_tokens_fix:
            try:
                # BERT-style models expose the matrix under ``embeddings.word_embeddings``.
                vocab_size = self.bert_model.embeddings.word_embeddings.num_embeddings
            except AttributeError:
                # Other architectures (e.g. XLNet) name it ``word_embedding``;
                # reserve more space.
                vocab_size = self.bert_model.word_embedding.num_embeddings + 5
            self.bert_model.resize_token_embeddings(vocab_size + 1)
gector/datareader.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tweaked AllenNLP dataset reader."""
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
from random import random
|
| 5 |
+
from typing import Dict, List
|
| 6 |
+
|
| 7 |
+
from allennlp.common.file_utils import cached_path
|
| 8 |
+
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
|
| 9 |
+
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
|
| 10 |
+
from allennlp.data.instance import Instance
|
| 11 |
+
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
|
| 12 |
+
from allennlp.data.tokenizers import Token
|
| 13 |
+
from overrides import overrides
|
| 14 |
+
|
| 15 |
+
from utils.helpers import SEQ_DELIMETERS, START_TOKEN
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@DatasetReader.register("seq2labels_datareader")
class Seq2LabelsDatasetReader(DatasetReader):
    """
    Reads instances from a pretokenised file where each line is in the following format:

    WORD###TAG [TAB] WORD###TAG [TAB] ..... \n

    and converts it into a ``Dataset`` suitable for sequence tagging. You can also specify
    alternative delimiters in the constructor.

    Parameters
    ----------
    delimiters: ``dict``
        The dictionary with all delimiters.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text. See :class:`TokenIndexer`.
        Note that the `output` tags will always correspond to single token IDs based on how they
        are pre-tokenised in the data file.
    max_len: if set than will truncate long sentences
    """
    # fix broken sentences mostly in Lang8: a period immediately followed by a
    # letter (except S, which would match abbreviations like ".S") marks a
    # likely mis-segmented line.
    BROKEN_SENTENCES_REGEXP = re.compile(r'\.[a-zA-RT-Z]')

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 delimeters: dict = SEQ_DELIMETERS,
                 skip_correct: bool = False,
                 skip_complex: int = 0,
                 lazy: bool = False,
                 max_len: int = None,
                 test_mode: bool = False,
                 tag_strategy: str = "keep_one",
                 tn_prob: float = 0,
                 tp_prob: float = 0,
                 broken_dot_strategy: str = "keep") -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._delimeters = delimeters
        self._max_len = max_len
        # Sampling/filtering knobs: tn_prob/tp_prob are keep-probabilities for
        # fully-correct / error-containing sentences respectively (see
        # text_to_instance).
        self._skip_correct = skip_correct
        self._skip_complex = skip_complex
        self._tag_strategy = tag_strategy
        self._broken_dot_strategy = broken_dot_strategy
        self._test_mode = test_mode
        self._tn_prob = tn_prob
        self._tp_prob = tp_prob

    @overrides
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                # skip blank and broken lines (broken-line filtering only
                # applies during training, never in test mode)
                if not line or (not self._test_mode and self._broken_dot_strategy == 'skip'
                                and self.BROKEN_SENTENCES_REGEXP.search(line) is not None):
                    continue

                # Each token carries its tag(s): "word###tag"; rsplit keeps any
                # '###' occurring inside the word itself intact.
                tokens_and_tags = [pair.rsplit(self._delimeters['labels'], 1)
                                   for pair in line.split(self._delimeters['tokens'])]
                try:
                    tokens = [Token(token) for token, tag in tokens_and_tags]
                    tags = [tag for token, tag in tokens_and_tags]
                except ValueError:
                    # Untagged input: each pair is a one-element list.
                    tokens = [Token(token[0]) for token in tokens_and_tags]
                    tags = None

                # Ensure every sequence starts with the $START sentinel token.
                # NOTE(review): relies on allennlp Token equality comparing
                # token text — confirm for the pinned allennlp version.
                if tokens and tokens[0] != Token(START_TOKEN):
                    tokens = [Token(START_TOKEN)] + tokens

                # Full (untruncated) surface words are kept as metadata;
                # presumably used downstream for reconstructing/evaluating the
                # original sentence — truncation below affects only model input.
                words = [x.text for x in tokens]
                if self._max_len is not None:
                    tokens = tokens[:self._max_len]
                    tags = None if tags is None else tags[:self._max_len]
                instance = self.text_to_instance(tokens, tags, words)
                if instance:
                    yield instance

    def extract_tags(self, tags: List[str]):
        """Split raw tag strings into labels, binary detect tags, and complexity counts."""
        op_del = self._delimeters['operations']

        # Each tag string may hold several operations joined by op_del.
        labels = [x.split(op_del) for x in tags]

        comlex_flag_dict = {}
        # get flags: comlex_flag_dict[k] counts tokens carrying more than k
        # operations, used by the skip_complex filter in text_to_instance.
        for i in range(5):
            idx = i + 1
            comlex_flag_dict[idx] = sum([len(x) > idx for x in labels])

        if self._tag_strategy == "keep_one":
            # get only first candidates for r_tags in right and the last for left
            labels = [x[0] for x in labels]
        elif self._tag_strategy == "merge_all":
            # consider phrases as a words
            pass
        else:
            raise Exception("Incorrect tag strategy")

        # Binary error-detection labels parallel to `labels`.
        detect_tags = ["CORRECT" if label == "$KEEP" else "INCORRECT" for label in labels]
        return labels, detect_tags, comlex_flag_dict

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                         words: List[str] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Returns None (instance is dropped) when complexity/sampling filters
        reject the sentence.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        fields["metadata"] = MetadataField({"words": words})
        if tags is not None:
            labels, detect_tags, complex_flag_dict = self.extract_tags(tags)
            if self._skip_complex and complex_flag_dict[self._skip_complex] > 0:
                return None
            rnd = random()
            # skip TN: fully-correct sentences are kept with prob tn_prob
            if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
                if rnd > self._tn_prob:
                    return None
            # skip TP: sentences with errors are kept with prob tp_prob.
            # NOTE(review): with skip_correct=False every sentence goes through
            # this branch, so tp_prob must be set (e.g. 1) by the caller or
            # everything is dropped — confirm against the training config.
            else:
                if rnd > self._tp_prob:
                    return None

            fields["labels"] = SequenceLabelField(labels, sequence,
                                                  label_namespace="labels")
            fields["d_tags"] = SequenceLabelField(detect_tags, sequence,
                                                  label_namespace="d_tags")
        return Instance(fields)
gector/gec_model.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Wrapper of AllenNLP model. Fixes errors based on model predictions"""
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from time import time
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from allennlp.data.dataset import Batch
|
| 9 |
+
from allennlp.data.fields import TextField
|
| 10 |
+
from allennlp.data.instance import Instance
|
| 11 |
+
from allennlp.data.tokenizers import Token
|
| 12 |
+
from allennlp.data.vocabulary import Vocabulary
|
| 13 |
+
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
|
| 14 |
+
from allennlp.nn import util
|
| 15 |
+
|
| 16 |
+
from gector.bert_token_embedder import PretrainedBertEmbedder
|
| 17 |
+
from gector.seq2labels_model import Seq2Labels
|
| 18 |
+
from gector.tokenizer_indexer import PretrainedBertIndexer
|
| 19 |
+
from utils.helpers import PAD, UNK, get_target_sent_by_edits, START_TOKEN
|
| 20 |
+
from utils.helpers import get_weights_name
|
| 21 |
+
|
| 22 |
+
logging.getLogger("werkzeug").setLevel(logging.ERROR)
|
| 23 |
+
logger = logging.getLogger(__file__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class GecBERTModel(object):
|
| 27 |
+
    def __init__(self, vocab_path=None, model_paths=None,
                 weigths=None,
                 max_len=50,
                 min_len=3,
                 lowercase_tokens=False,
                 log=False,
                 iterations=3,
                 model_name='roberta',
                 special_tokens_fix=1,
                 is_ensemble=True,
                 min_error_probability=0.0,
                 confidence=0,
                 del_confidence=0,
                 resolve_cycles=False,
                 ):
        """Build the (possibly ensembled) GEC tagger.

        ``weigths`` (sic — misspelled but part of the public interface) holds
        per-model ensemble weights; defaults to uniform weights of 1.
        """
        self.model_weights = list(map(float, weigths)) if weigths else [1] * len(model_paths)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.max_len = max_len
        self.min_len = min_len
        self.lowercase_tokens = lowercase_tokens
        self.min_error_probability = min_error_probability
        self.vocab = Vocabulary.from_files(vocab_path)
        self.log = log
        self.iterations = iterations
        self.confidence = confidence
        self.del_conf = del_confidence
        self.resolve_cycles = resolve_cycles
        # set training parameters and operations

        self.indexers = []
        self.models = []
        for model_path in model_paths:
            if is_ensemble:
                # In ensemble mode, each model's transformer name and
                # special-tokens-fix flag are encoded in its file name.
                model_name, special_tokens_fix = self._get_model_data(model_path)
            weights_name = get_weights_name(model_name, lowercase_tokens)
            self.indexers.append(self._get_indexer(weights_name, special_tokens_fix))
            model = Seq2Labels(vocab=self.vocab,
                               text_field_embedder=self._get_embbeder(weights_name, special_tokens_fix),
                               confidence=self.confidence,
                               del_confidence=self.del_conf,
                               ).to(self.device)
            # strict=False tolerates checkpoints with missing/extra keys.
            if torch.cuda.is_available():
                model.load_state_dict(torch.load(model_path), strict=False)
            else:
                model.load_state_dict(torch.load(model_path,
                                                 map_location=torch.device('cpu')),
                                      strict=False)
            model.eval()
            self.models.append(model)
+
|
| 77 |
+
@staticmethod
|
| 78 |
+
def _get_model_data(model_path):
|
| 79 |
+
model_name = model_path.split('/')[-1]
|
| 80 |
+
tr_model, stf = model_name.split('_')[:2]
|
| 81 |
+
return tr_model, int(stf)
|
| 82 |
+
|
| 83 |
+
def _restore_model(self, input_path):
|
| 84 |
+
if os.path.isdir(input_path):
|
| 85 |
+
print("Model could not be restored from directory", file=sys.stderr)
|
| 86 |
+
filenames = []
|
| 87 |
+
else:
|
| 88 |
+
filenames = [input_path]
|
| 89 |
+
for model_path in filenames:
|
| 90 |
+
try:
|
| 91 |
+
if torch.cuda.is_available():
|
| 92 |
+
loaded_model = torch.load(model_path)
|
| 93 |
+
else:
|
| 94 |
+
loaded_model = torch.load(model_path,
|
| 95 |
+
map_location=lambda storage,
|
| 96 |
+
loc: storage)
|
| 97 |
+
except:
|
| 98 |
+
print(f"{model_path} is not valid model", file=sys.stderr)
|
| 99 |
+
own_state = self.model.state_dict()
|
| 100 |
+
for name, weights in loaded_model.items():
|
| 101 |
+
if name not in own_state:
|
| 102 |
+
continue
|
| 103 |
+
try:
|
| 104 |
+
if len(filenames) == 1:
|
| 105 |
+
own_state[name].copy_(weights)
|
| 106 |
+
else:
|
| 107 |
+
own_state[name] += weights
|
| 108 |
+
except RuntimeError:
|
| 109 |
+
continue
|
| 110 |
+
print("Model is restored", file=sys.stderr)
|
| 111 |
+
|
| 112 |
+
def predict(self, batches):
|
| 113 |
+
t11 = time()
|
| 114 |
+
predictions = []
|
| 115 |
+
for batch, model in zip(batches, self.models):
|
| 116 |
+
batch = util.move_to_device(batch.as_tensor_dict(), 0 if torch.cuda.is_available() else -1)
|
| 117 |
+
with torch.no_grad():
|
| 118 |
+
prediction = model.forward(**batch)
|
| 119 |
+
predictions.append(prediction)
|
| 120 |
+
|
| 121 |
+
preds, idx, error_probs = self._convert(predictions)
|
| 122 |
+
t55 = time()
|
| 123 |
+
if self.log:
|
| 124 |
+
print(f"Inference time {t55 - t11}")
|
| 125 |
+
return preds, idx, error_probs
|
| 126 |
+
|
| 127 |
+
def get_token_action(self, token, index, prob, sugg_token):
|
| 128 |
+
"""Get lost of suggested actions for token."""
|
| 129 |
+
# cases when we don't need to do anything
|
| 130 |
+
if prob < self.min_error_probability or sugg_token in [UNK, PAD, '$KEEP']:
|
| 131 |
+
return None
|
| 132 |
+
|
| 133 |
+
if sugg_token.startswith('$REPLACE_') or sugg_token.startswith('$TRANSFORM_') or sugg_token == '$DELETE':
|
| 134 |
+
start_pos = index
|
| 135 |
+
end_pos = index + 1
|
| 136 |
+
elif sugg_token.startswith("$APPEND_") or sugg_token.startswith("$MERGE_"):
|
| 137 |
+
start_pos = index + 1
|
| 138 |
+
end_pos = index + 1
|
| 139 |
+
|
| 140 |
+
if sugg_token == "$DELETE":
|
| 141 |
+
sugg_token_clear = ""
|
| 142 |
+
elif sugg_token.startswith('$TRANSFORM_') or sugg_token.startswith("$MERGE_"):
|
| 143 |
+
sugg_token_clear = sugg_token[:]
|
| 144 |
+
else:
|
| 145 |
+
sugg_token_clear = sugg_token[sugg_token.index('_') + 1:]
|
| 146 |
+
|
| 147 |
+
return start_pos - 1, end_pos - 1, sugg_token_clear, prob
|
| 148 |
+
|
| 149 |
+
def _get_embbeder(self, weigths_name, special_tokens_fix):
|
| 150 |
+
embedders = {'bert': PretrainedBertEmbedder(
|
| 151 |
+
pretrained_model=weigths_name,
|
| 152 |
+
requires_grad=False,
|
| 153 |
+
top_layer_only=True,
|
| 154 |
+
special_tokens_fix=special_tokens_fix)
|
| 155 |
+
}
|
| 156 |
+
text_field_embedder = BasicTextFieldEmbedder(
|
| 157 |
+
token_embedders=embedders,
|
| 158 |
+
embedder_to_indexer_map={"bert": ["bert", "bert-offsets"]},
|
| 159 |
+
allow_unmatched_keys=True)
|
| 160 |
+
return text_field_embedder
|
| 161 |
+
|
| 162 |
+
def _get_indexer(self, weights_name, special_tokens_fix):
|
| 163 |
+
bert_token_indexer = PretrainedBertIndexer(
|
| 164 |
+
pretrained_model=weights_name,
|
| 165 |
+
do_lowercase=self.lowercase_tokens,
|
| 166 |
+
max_pieces_per_token=5,
|
| 167 |
+
special_tokens_fix=special_tokens_fix
|
| 168 |
+
)
|
| 169 |
+
return {'bert': bert_token_indexer}
|
| 170 |
+
|
| 171 |
+
def preprocess(self, token_batch):
    """Convert raw token sequences into indexed AllenNLP batches, one per indexer.

    Sequences are truncated to ``self.max_len`` and prefixed with the
    ``$START`` token. Returns ``[]`` when every sequence is empty.
    """
    lengths = [len(seq) for seq in token_batch if seq]
    if not lengths:
        return []
    max_len = min(max(lengths), self.max_len)

    batches = []
    for indexer in self.indexers:
        instances = []
        for seq in token_batch:
            wrapped = [Token(word) for word in ['$START'] + seq[:max_len]]
            instances.append(Instance({'tokens': TextField(wrapped, indexer)}))
        indexed = Batch(instances)
        indexed.index_instances(self.vocab)
        batches.append(indexed)

    return batches
def _convert(self, data):
|
| 190 |
+
all_class_probs = torch.zeros_like(data[0]['class_probabilities_labels'])
|
| 191 |
+
error_probs = torch.zeros_like(data[0]['max_error_probability'])
|
| 192 |
+
for output, weight in zip(data, self.model_weights):
|
| 193 |
+
all_class_probs += weight * output['class_probabilities_labels'] / sum(self.model_weights)
|
| 194 |
+
error_probs += weight * output['max_error_probability'] / sum(self.model_weights)
|
| 195 |
+
|
| 196 |
+
max_vals = torch.max(all_class_probs, dim=-1)
|
| 197 |
+
probs = max_vals[0].tolist()
|
| 198 |
+
idx = max_vals[1].tolist()
|
| 199 |
+
return probs, idx, error_probs.tolist()
|
| 200 |
+
|
| 201 |
+
def update_final_batch(self, final_batch, pred_ids, pred_batch,
                       prev_preds_dict):
    """Merge fresh predictions into the running batch.

    A sentence whose prediction changed to something never seen before is
    kept for another correction round; one that changed back to an earlier
    prediction is updated but dropped (stops oscillation). Returns the
    updated batch, the ids to keep iterating on, and the update count.
    """
    ids_to_continue = []
    updated = 0
    for pos, orig_id in enumerate(pred_ids):
        previous = final_batch[orig_id]
        fresh = pred_batch[pos]
        if fresh == previous:
            continue
        final_batch[orig_id] = fresh
        updated += 1
        if fresh not in prev_preds_dict[orig_id]:
            ids_to_continue.append(orig_id)
            prev_preds_dict[orig_id].append(fresh)
    return final_batch, ids_to_continue, updated
def postprocess_batch(self, batch, all_probabilities, all_idxs,
                      error_probs):
    """Turn per-token label predictions into corrected sentences.

    For each sentence, collects edit actions for every non-$KEEP label and
    applies them via ``get_target_sent_by_edits``. Sentences with no
    predicted errors, or whose sentence-level error probability is below
    ``self.min_error_probability``, are returned unchanged.
    """
    all_results = []
    noop_index = self.vocab.get_token_index("$KEEP", "labels")
    for tokens, probabilities, idxs, error_prob in zip(batch,
                                                       all_probabilities,
                                                       all_idxs,
                                                       error_probs):
        # predictions include the $START position, hence length + 1 below
        length = min(len(tokens), self.max_len)
        edits = []

        # skip whole sentences if there no errors
        if max(idxs) == 0:
            all_results.append(tokens)
            continue

        # skip whole sentence if probability of correctness is not high
        if error_prob < self.min_error_probability:
            all_results.append(tokens)
            continue

        for i in range(length + 1):
            # because of START token
            if i == 0:
                token = START_TOKEN
            else:
                token = tokens[i - 1]
            # skip if there is no error
            if idxs[i] == noop_index:
                continue

            sugg_token = self.vocab.get_token_from_index(idxs[i],
                                                         namespace='labels')
            action = self.get_token_action(token, i, probabilities[i],
                                           sugg_token)
            # get_token_action returns None for low-probability / no-op labels
            if not action:
                continue

            edits.append(action)
        all_results.append(get_target_sent_by_edits(tokens, edits))
    return all_results
def handle_batch(self, full_batch):
    """
    Handle batch of requests.

    Iteratively corrects the batch up to ``self.iterations`` times,
    re-predicting only sentences that changed in the previous round.
    Returns the corrected batch and the total number of sentence updates.
    """
    final_batch = full_batch[:]
    batch_size = len(full_batch)
    # per-sentence history of predictions, used to detect oscillation
    prev_preds_dict = {i: [final_batch[i]] for i in range(len(final_batch))}
    # sentences shorter than min_len are never corrected
    short_ids = [i for i in range(len(full_batch))
                 if len(full_batch[i]) < self.min_len]
    pred_ids = [i for i in range(len(full_batch)) if i not in short_ids]
    total_updates = 0

    for n_iter in range(self.iterations):
        # only re-run sentences that are still candidates for correction
        orig_batch = [final_batch[i] for i in pred_ids]

        sequences = self.preprocess(orig_batch)

        if not sequences:
            break
        probabilities, idxs, error_probs = self.predict(sequences)

        pred_batch = self.postprocess_batch(orig_batch, probabilities,
                                            idxs, error_probs)
        if self.log:
            print(f"Iteration {n_iter + 1}. Predicted {round(100*len(pred_ids)/batch_size, 1)}% of sentences.")

        # keep only sentences whose prediction changed to something new
        final_batch, pred_ids, cnt = \
            self.update_final_batch(final_batch, pred_ids, pred_batch,
                                    prev_preds_dict)
        total_updates += cnt

        if not pred_ids:
            break

    return final_batch, total_updates
|
gector/seq2labels_model.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Basic model. Predicts tags for every token"""
|
| 2 |
+
from typing import Dict, Optional, List, Any
|
| 3 |
+
|
| 4 |
+
import numpy
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from allennlp.data import Vocabulary
|
| 8 |
+
from allennlp.models.model import Model
|
| 9 |
+
from allennlp.modules import TimeDistributed, TextFieldEmbedder
|
| 10 |
+
from allennlp.nn import InitializerApplicator, RegularizerApplicator
|
| 11 |
+
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
|
| 12 |
+
from allennlp.training.metrics import CategoricalAccuracy
|
| 13 |
+
from overrides import overrides
|
| 14 |
+
from torch.nn.modules.linear import Linear
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@Model.register("seq2labels")
class Seq2Labels(Model):
    """
    This ``Seq2Labels`` simply encodes a sequence of text with a stacked ``Seq2SeqEncoder``, then
    predicts a tag (or couple tags) for each token in the sequence.

    The model carries two heads over a shared text-field embedder: one over
    the correction-label vocabulary (``labels_namespace``) and one over the
    binary error-detection tags (``detect_namespace``).

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
    predictor_dropout : ``float``, optional (default = 0.0)
        Dropout applied before the label projection layer (not the detection one).
    labels_namespace : ``str``, optional (default=``labels``)
        Vocabulary namespace for the correction labels.
    detect_namespace : ``str``, optional (default=``d_tags``)
        Vocabulary namespace for the CORRECT/INCORRECT detection tags.
    verbose_metrics : ``bool``, optional (default = False)
        If true, metrics will be returned per label class in addition
        to the overall statistics.
    label_smoothing : ``float``, optional (default = 0.0)
        Label smoothing applied to the label-head cross entropy.
    confidence : ``float``, optional (default = 0.0)
        Inference-time bias added to the first label class.
        NOTE(review): assumes index 0 of the labels namespace is $KEEP — confirm.
    del_confidence : ``float``, optional (default = 0.0)
        Inference-time bias added to the second label class.
        NOTE(review): assumes index 1 is $DELETE — confirm.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 predictor_dropout=0.0,
                 labels_namespace: str = "labels",
                 detect_namespace: str = "d_tags",
                 verbose_metrics: bool = False,
                 label_smoothing: float = 0.0,
                 confidence: float = 0.0,
                 del_confidence: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(Seq2Labels, self).__init__(vocab, regularizer)

        # Two tagging heads share one encoder output: labels and d_tags.
        self.label_namespaces = [labels_namespace,
                                 detect_namespace]
        self.text_field_embedder = text_field_embedder
        self.num_labels_classes = self.vocab.get_vocab_size(labels_namespace)
        self.num_detect_classes = self.vocab.get_vocab_size(detect_namespace)
        self.label_smoothing = label_smoothing
        self.confidence = confidence
        self.del_conf = del_confidence
        # index of the INCORRECT tag, used to read off error probabilities
        self.incorr_index = self.vocab.get_token_index("INCORRECT",
                                                       namespace=detect_namespace)

        self._verbose_metrics = verbose_metrics
        self.predictor_dropout = TimeDistributed(torch.nn.Dropout(predictor_dropout))

        # projection from the BERT hidden size to each tag vocabulary
        self.tag_labels_projection_layer = TimeDistributed(
            Linear(text_field_embedder._token_embedders['bert'].get_output_dim(), self.num_labels_classes))

        self.tag_detect_projection_layer = TimeDistributed(
            Linear(text_field_embedder._token_embedders['bert'].get_output_dim(), self.num_detect_classes))

        self.metrics = {"accuracy": CategoricalAccuracy()}

        initializer(self)

    @overrides
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                labels: torch.LongTensor = None,
                d_tags: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        d_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        encoded_text = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = encoded_text.size()
        mask = get_text_field_mask(tokens)
        logits_labels = self.tag_labels_projection_layer(self.predictor_dropout(encoded_text))
        logits_d = self.tag_detect_projection_layer(encoded_text)

        class_probabilities_labels = F.softmax(logits_labels, dim=-1).view(
            [batch_size, sequence_length, self.num_labels_classes])
        class_probabilities_d = F.softmax(logits_d, dim=-1).view(
            [batch_size, sequence_length, self.num_detect_classes])
        # sentence-level error probability = max over tokens of P(INCORRECT)
        error_probs = class_probabilities_d[:, :, self.incorr_index] * mask
        incorr_prob = torch.max(error_probs, dim=-1)[0]

        # Inference-time bias: boost the first two label classes by
        # confidence / del_confidence. NOTE(review): makes the resulting
        # "probabilities" unnormalized — downstream only takes argmax/max.
        probability_change = [self.confidence, self.del_conf] + [0] * (self.num_labels_classes - 2)
        class_probabilities_labels += torch.FloatTensor(probability_change).repeat(
            (batch_size, sequence_length, 1)).to(class_probabilities_labels.device)

        output_dict = {"logits_labels": logits_labels,
                       "logits_d_tags": logits_d,
                       "class_probabilities_labels": class_probabilities_labels,
                       "class_probabilities_d_tags": class_probabilities_d,
                       "max_error_probability": incorr_prob}
        if labels is not None and d_tags is not None:
            loss_labels = sequence_cross_entropy_with_logits(logits_labels, labels, mask,
                                                             label_smoothing=self.label_smoothing)
            loss_d = sequence_cross_entropy_with_logits(logits_d, d_tags, mask)
            # NOTE(review): both heads feed the same accuracy metric, so the
            # reported "accuracy" mixes label and detection predictions.
            for metric in self.metrics.values():
                metric(logits_labels, labels, mask.float())
                metric(logits_d, d_tags, mask.float())
            output_dict["loss"] = loss_labels + loss_d

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Does a simple position-wise argmax over each token, converts indices to string labels, and
        adds a ``"tags"`` key to the dictionary with the result.
        """
        for label_namespace in self.label_namespaces:
            all_predictions = output_dict[f'class_probabilities_{label_namespace}']
            all_predictions = all_predictions.cpu().data.numpy()
            if all_predictions.ndim == 3:
                predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
            else:
                predictions_list = [all_predictions]
            all_tags = []

            for predictions in predictions_list:
                argmax_indices = numpy.argmax(predictions, axis=-1)
                tags = [self.vocab.get_token_from_index(x, namespace=label_namespace)
                        for x in argmax_indices]
                all_tags.append(tags)
            output_dict[f'{label_namespace}'] = all_tags
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return accumulated metrics, optionally resetting their state."""
        metrics_to_return = {metric_name: metric.get_metric(reset) for
                             metric_name, metric in self.metrics.items()}
        return metrics_to_return
|
gector/tokenization.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from time import time
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# NOTE(review): presumably disables HuggingFace tokenizers' thread
# parallelism (avoids fork warnings/deadlocks in workers) — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_bpe_groups(token_offsets, bpe_offsets, input_ids, max_bpe_pieces=5):
    """Group BPE-piece indices by the original token they belong to.

    A piece belongs to a token when its character span lies inside the
    token's span. Each token keeps at most ``max_bpe_pieces`` pieces; the
    overflow piece ids are excluded from the returned ``saved`` id list.

    Returns ``(groups, saved)`` where ``groups[t]`` is the list of kept
    piece indices for token ``t`` and ``saved`` is every input-id position
    that survived the per-token cap.
    """
    # a (0, 0) entry marks the start of padding in the offset mapping
    try:
        usable = bpe_offsets.index((0, 0))
    except ValueError:
        usable = len(bpe_offsets)

    groups = []
    dropped = []
    cursor = 0
    for start_token, end_token in token_offsets:
        group = []
        found = False
        for piece_idx in range(cursor, usable):
            start_bpe, end_bpe = bpe_offsets[piece_idx]
            if start_bpe >= start_token and end_bpe <= end_token:
                if len(group) < max_bpe_pieces:
                    group.append(piece_idx)
                else:
                    dropped.append(piece_idx)
                cursor = piece_idx + 1
                found = True
            elif found:
                # pieces are ordered; once past this token, stop scanning
                break
        groups.append(group)

    saved = [i for i in range(len(input_ids)) if i not in dropped]
    return groups, saved
def reduce_input_ids(input_ids, bpe_groups, saved_ids,
                     max_bpe_length=80, max_bpe_pieces=5):
    """Shrink a BPE sequence until it fits within ``max_bpe_length`` pieces.

    Repeatedly lowers the per-token piece budget and drops each token's
    overflow pieces (mutating ``bpe_groups`` in place) until the kept
    sequence is short enough. Returns the kept input ids and, per token,
    the index of its first kept piece within the reduced sequence.

    NOTE(review): if the sequence cannot shrink below ``max_bpe_length``
    even at one piece per token, this loop keeps decrementing the budget
    toward zero — verify inputs are bounded upstream.
    """
    # check if sequence is satisfy max_bpe_length constraint
    while len(saved_ids) > max_bpe_length:
        max_bpe_pieces -= 1
        for token_id in range(len(bpe_groups)):
            if len(bpe_groups[token_id]) > max_bpe_pieces:
                redundant_ids = bpe_groups[token_id][max_bpe_pieces:]
                bpe_groups[token_id] = bpe_groups[token_id][:max_bpe_pieces]
                saved_ids = [i for i in saved_ids if i not in redundant_ids]

    # get offsets
    reduced_ids = [input_ids[i] for i in saved_ids]
    correct_offsets = []
    idx = 0
    for i, bpe_group in enumerate(bpe_groups):
        # clamp so trailing tokens with no kept pieces still map in-bounds
        norm_idx = min(idx, len(reduced_ids) - 1)
        correct_offsets.append(norm_idx)
        idx += len(bpe_group)

    return reduced_ids, correct_offsets
def get_offsets_and_reduce_input_ids(tokenizer_output, token_offset_list,
                                     index_name="bert", max_bpe_length=80,
                                     max_bpe_pieces=5):
    """Align tokenizer output with token offsets and trim over-long sequences.

    For each sentence, maps tokens onto their BPE pieces, enforces the
    per-token and per-sentence piece budgets, and builds a matching mask.
    Returns a dict with keys ``index_name``, ``f"{index_name}-offsets"``
    and ``"mask"``, each holding one list per sentence.
    """
    timings = {"bpe": 0, "reduce": 0, "mask": 0}
    all_ids, all_offsets, all_masks = [], [], []
    for sent_idx, token_offsets in enumerate(token_offset_list):
        input_ids = tokenizer_output['input_ids'][sent_idx]

        stamp = time()
        # map each token onto its group of BPE pieces
        bpe_offsets = tokenizer_output['offset_mapping'][sent_idx]
        bpe_groups, saved_ids = get_bpe_groups(token_offsets, bpe_offsets,
                                               input_ids,
                                               max_bpe_pieces=max_bpe_pieces)
        now = time()
        timings["bpe"] += now - stamp
        stamp = now

        # enforce the sequence-length budget
        reduced_ids, correct_offsets = reduce_input_ids(
            input_ids, bpe_groups, saved_ids,
            max_bpe_length=max_bpe_length,
            max_bpe_pieces=max_bpe_pieces)
        now = time()
        timings["reduce"] += now - stamp
        stamp = now

        all_ids.append(reduced_ids)
        all_offsets.append(correct_offsets)
        # one mask entry per token kept
        all_masks.append([1] * len(correct_offsets))
        timings["mask"] += time() - stamp

    return {index_name: all_ids,
            f"{index_name}-offsets": all_offsets,
            "mask": all_masks}
def get_offset_for_tokens(tokens):
    """Character (start, end) offsets of each token within ``" ".join(tokens)``."""
    joined = " ".join(tokens)
    offsets = []
    cursor = 0
    for token in tokens:
        begin = joined.index(token, cursor)
        cursor = begin + len(token)
        offsets.append((begin, cursor))
    return offsets
def get_token_offsets(batch):
    """Per-sentence token character offsets for a batch of token lists."""
    return [get_offset_for_tokens(tokens) for tokens in batch]
def pad_output(output, pad_idx=0):
    """Right-pad every index list to its key's max length with ``pad_idx``."""
    padded = {}
    for key, rows in output.items():
        width = max(len(row) for row in rows)
        padded[key] = [row + [pad_idx] * (width - len(row)) for row in rows]
    return padded
def tokenize_batch(tokenizer, batch_tokens, index_name="bert",
                   max_bpe_length=80, max_bpe_pieces=5):
    """Tokenize a batch of pre-split sentences into padded BPE index dicts.

    Joins each token list back into a sentence, runs the fast tokenizer
    with offset mapping, re-aligns pieces to the original tokens (trimming
    to the piece budgets), and pads every output list.
    """
    timings = {}
    checkpoint = time()

    # join tokens back into raw sentences for the tokenizer
    batch_sentences = [" ".join(tokens) for tokens in batch_tokens]
    # character offsets of every original token
    token_offset_list = get_token_offsets(batch_tokens)
    timings["offset_time"] = time() - checkpoint
    checkpoint = time()

    tokenizer_output = tokenizer.batch_encode_plus(batch_sentences,
                                                   pad_to_max_length=False,
                                                   return_offsets_mapping=True,
                                                   add_special_tokens=False)
    timings["tokenize_time"] = time() - checkpoint
    checkpoint = time()

    # re-align BPE pieces with tokens and enforce length budgets
    output = get_offsets_and_reduce_input_ids(tokenizer_output,
                                              token_offset_list,
                                              index_name=index_name,
                                              max_bpe_length=max_bpe_length,
                                              max_bpe_pieces=max_bpe_pieces)
    timings["reduce_time"] = time() - checkpoint
    checkpoint = time()

    output = pad_output(output)
    timings["pading_time"] = time() - checkpoint

    return output
|
gector/tokenizer_indexer.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tweaked version of corresponding AllenNLP file"""
|
| 2 |
+
import logging
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from typing import Dict, List, Callable
|
| 5 |
+
|
| 6 |
+
from allennlp.common.util import pad_sequence_to_length
|
| 7 |
+
from allennlp.data.token_indexers.token_indexer import TokenIndexer
|
| 8 |
+
from allennlp.data.tokenizers.token import Token
|
| 9 |
+
from allennlp.data.vocabulary import Vocabulary
|
| 10 |
+
from overrides import overrides
|
| 11 |
+
from transformers import AutoTokenizer
|
| 12 |
+
|
| 13 |
+
from utils.helpers import START_TOKEN
|
| 14 |
+
|
| 15 |
+
from gector.tokenization import tokenize_batch
|
| 16 |
+
import copy
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# TODO(joelgrus): Figure out how to generate token_type_ids out of this token indexer.
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TokenizerIndexer(TokenIndexer[int]):
    """
    A token indexer that performs wordpiece tokenization (e.g. for BERT).

    Prefer the ``PretrainedBertIndexer`` subclass when working with one of
    the pretrained models; this base class only wires a tokenizer callable
    into the AllenNLP ``TokenIndexer`` interface.

    Parameters
    ----------
    tokenizer : ``Callable[[str], List[str]]``
        A function that does the actual tokenization.
    max_pieces : int, optional (default: 512)
        Upper bound on BPE pieces per sequence (positional-embedding limit
        of BERT-style models); longer inputs are reduced.
    max_pieces_per_token : int, optional (default: 3)
        Upper bound on BPE pieces kept per original token.
    token_min_padding_length : ``int``, optional (default=``0``)
        See :class:`TokenIndexer`.
    """

    def __init__(self,
                 tokenizer: Callable[[str], List[str]],
                 max_pieces: int = 512,
                 max_pieces_per_token: int = 3,
                 token_min_padding_length: int = 0) -> None:
        super().__init__(token_min_padding_length)
        # BERT tokenizes in two steps (sentence -> words -> wordpieces);
        # this indexer is responsible for the word -> wordpieces step.
        self.tokenizer = tokenizer
        self.max_pieces = max_pieces
        self.max_pieces_per_token = max_pieces_per_token
        self.max_pieces_per_sentence = 80

    @overrides
    def tokens_to_indices(self, tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        words = [token.text for token in tokens]
        batched = tokenize_batch(self.tokenizer,
                                 [words],
                                 max_bpe_length=self.max_pieces,
                                 max_bpe_pieces=self.max_pieces_per_token)
        # tokenize_batch works on batches; unwrap the single sentence
        return {key: rows[0] for key, rows in batched.items()}

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # Pretrained vocabularies are fixed, so there is nothing to count.
        pass

    @overrides
    def get_padding_token(self) -> int:
        return 0

    @overrides
    def get_padding_lengths(self, token: int) -> Dict[str, int]:  # pylint: disable=unused-argument
        return {}

    @overrides
    def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        padded = {}
        for key, val in tokens.items():
            padded[key] = pad_sequence_to_length(val, desired_num_tokens[key])
        return padded

    @overrides
    def get_keys(self, index_name: str) -> List[str]:
        """
        We need to override this because the indexer generates multiple keys.
        """
        # pylint: disable=no-self-use
        return [index_name, f"{index_name}-offsets", f"{index_name}-type-ids", "mask"]
class PretrainedBertIndexer(TokenizerIndexer):
    # pylint: disable=line-too-long
    """
    A ``TokenIndexer`` corresponding to a pretrained BERT model.

    Loads a fast HuggingFace tokenizer for ``pretrained_model``, normalises
    its vocab attribute across implementations, and optionally registers the
    ``$START`` token when ``special_tokens_fix`` is set.

    Parameters
    ----------
    pretrained_model: ``str``
        Name of the pretrained model (e.g. 'bert-base-uncased') or a path
        understood by ``AutoTokenizer.from_pretrained``.
    do_lowercase: ``bool``, optional (default = True)
        Whether to lowercase the tokens before converting to wordpiece ids.
    max_pieces: int, optional (default: 512)
        Upper bound on BPE pieces per sequence (positional-embedding limit).
    max_pieces_per_token: int, optional (default: 5)
        Upper bound on BPE pieces kept per original token.
    special_tokens_fix: int, optional (default: 0)
        When truthy, adds ``$START`` to the tokenizer vocabulary.
    """

    def __init__(self,
                 pretrained_model: str,
                 do_lowercase: bool = True,
                 max_pieces: int = 512,
                 max_pieces_per_token: int = 5,
                 special_tokens_fix: int = 0) -> None:

        # Warn about a likely casing mismatch between model and indexer.
        if pretrained_model.endswith("-cased") and do_lowercase:
            logger.warning("Your BERT model appears to be cased, "
                           "but your indexer is lowercasing tokens.")
        elif pretrained_model.endswith("-uncased") and not do_lowercase:
            logger.warning("Your BERT model appears to be uncased, "
                           "but your indexer is not lowercasing tokens.")

        model_name = copy.deepcopy(pretrained_model)

        model_tokenizer = AutoTokenizer.from_pretrained(
            model_name, do_lower_case=do_lowercase, do_basic_tokenize=False, use_fast=True)

        # Normalise the vocab attribute across tokenizer implementations:
        # GPT-style tokenizers expose ``encoder`` ...
        if hasattr(model_tokenizer, 'encoder'):
            model_tokenizer.vocab = model_tokenizer.encoder
        # ... sentencepiece-backed ones expose ``sp_model`` (unknown pieces
        # default to id 1 via the defaultdict).
        if hasattr(model_tokenizer, 'sp_model'):
            sp_vocab = defaultdict(lambda: 1)
            for piece_id in range(model_tokenizer.sp_model.get_piece_size()):
                sp_vocab[model_tokenizer.sp_model.id_to_piece(piece_id)] = piece_id
            model_tokenizer.vocab = sp_vocab

        if special_tokens_fix:
            model_tokenizer.add_tokens([START_TOKEN])
            model_tokenizer.vocab[START_TOKEN] = len(model_tokenizer) - 1

        super().__init__(tokenizer=model_tokenizer,
                         max_pieces=max_pieces,
                         max_pieces_per_token=max_pieces_per_token)
gector/trainer.py
ADDED
|
@@ -0,0 +1,845 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tweaked version of corresponding AllenNLP file"""
|
| 2 |
+
import datetime
|
| 3 |
+
import logging
|
| 4 |
+
import math
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
import traceback
|
| 8 |
+
from typing import Dict, Optional, List, Tuple, Union, Iterable, Any
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torch.optim.lr_scheduler
|
| 12 |
+
from allennlp.common import Params
|
| 13 |
+
from allennlp.common.checks import ConfigurationError, parse_cuda_device
|
| 14 |
+
from allennlp.common.tqdm import Tqdm
|
| 15 |
+
from allennlp.common.util import dump_metrics, gpu_memory_mb, peak_memory_mb, lazy_groups_of
|
| 16 |
+
from allennlp.data.instance import Instance
|
| 17 |
+
from allennlp.data.iterators.data_iterator import DataIterator, TensorDict
|
| 18 |
+
from allennlp.models.model import Model
|
| 19 |
+
from allennlp.nn import util as nn_util
|
| 20 |
+
from allennlp.training import util as training_util
|
| 21 |
+
from allennlp.training.checkpointer import Checkpointer
|
| 22 |
+
from allennlp.training.learning_rate_schedulers import LearningRateScheduler
|
| 23 |
+
from allennlp.training.metric_tracker import MetricTracker
|
| 24 |
+
from allennlp.training.momentum_schedulers import MomentumScheduler
|
| 25 |
+
from allennlp.training.moving_average import MovingAverage
|
| 26 |
+
from allennlp.training.optimizers import Optimizer
|
| 27 |
+
from allennlp.training.tensorboard_writer import TensorboardWriter
|
| 28 |
+
from allennlp.training.trainer_base import TrainerBase
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Trainer(TrainerBase):
|
| 34 |
+
    def __init__(
        self,
        model: Model,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler,
        iterator: DataIterator,
        train_dataset: Iterable[Instance],
        validation_dataset: Optional[Iterable[Instance]] = None,
        patience: Optional[int] = None,
        validation_metric: str = "-loss",
        validation_iterator: DataIterator = None,
        shuffle: bool = True,
        num_epochs: int = 20,
        accumulated_batch_count: int = 1,
        serialization_dir: Optional[str] = None,
        num_serialized_models_to_keep: int = 20,
        keep_serialized_model_every_num_seconds: int = None,
        checkpointer: Checkpointer = None,
        model_save_interval: float = None,
        cuda_device: Union[int, List] = -1,
        grad_norm: Optional[float] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None,
        momentum_scheduler: Optional[MomentumScheduler] = None,
        summary_interval: int = 100,
        histogram_interval: int = None,
        should_log_parameter_statistics: bool = True,
        should_log_learning_rate: bool = False,
        log_batch_size_period: Optional[int] = None,
        moving_average: Optional[MovingAverage] = None,
        cold_step_count: int = 0,
        cold_lr: float = 1e-3,
        cuda_verbose_step=None,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. You can also pass in a validation
        dataset and enable early stopping. There are many other bells and whistles as well.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : ``torch.nn.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        scheduler : ``torch.optim.lr_scheduler``, required.
            A PyTorch learning-rate scheduler instance, stored as ``self.scheduler``.
            NOTE(review): it is not stepped anywhere in this method — confirm how the
            training entry point uses it.
        iterator : ``DataIterator``, required.
            A method for iterating over a ``Dataset``, yielding padded indexed batches.
        train_dataset : ``Dataset``, required.
            A ``Dataset`` to train on. The dataset should have already been indexed.
        validation_dataset : ``Dataset``, optional, (default = None).
            A ``Dataset`` to evaluate on. The dataset should have already been indexed.
        patience : Optional[int] > 0, optional (default=None)
            Number of epochs to be patient before early stopping: the training is stopped
            after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
            If None, early stopping is disabled.
        validation_metric : str, optional (default="loss")
            Validation metric to measure for whether to stop training using patience
            and whether to serialize an ``is_best`` model each epoch. The metric name
            must be prepended with either "+" or "-", which specifies whether the metric
            is an increasing or decreasing function.
        validation_iterator : ``DataIterator``, optional (default=None)
            An iterator to use for the validation set. If ``None``, then
            use the training `iterator`.
        shuffle: ``bool``, optional (default=True)
            Whether to shuffle the instances in the iterator or not.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        accumulated_batch_count : int, optional (default = 1)
            Number of consecutive batches over which gradients are accumulated
            before each optimizer step.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        num_serialized_models_to_keep : ``int``, optional (default=20)
            Number of previous model checkpoints to retain. Default is to keep 20 checkpoints.
            A value of None or -1 means all checkpoints will be kept.
        keep_serialized_model_every_num_seconds : ``int``, optional (default=None)
            If num_serialized_models_to_keep is not None, then occasionally it's useful to
            save models at a given interval in addition to the last num_serialized_models_to_keep.
            To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
            between permanently saved checkpoints. Note that this option is only used if
            num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
        checkpointer : ``Checkpointer``, optional (default=None)
            An instance of class Checkpointer to use instead of the default. If a checkpointer is specified,
            the arguments num_serialized_models_to_keep and keep_serialized_model_every_num_seconds should
            not be specified. The caller is responsible for initializing the checkpointer so that it is
            consistent with serialization_dir.
        model_save_interval : ``float``, optional (default=None)
            If provided, then serialize models every ``model_save_interval``
            seconds within single epochs. In all cases, models are also saved
            at the end of every epoch if ``serialization_dir`` is provided.
        cuda_device : ``Union[int, List[int]]``, optional (default = -1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        grad_norm : ``float``, optional, (default = None).
            If provided, gradient norms will be rescaled to have a maximum of this value.
        grad_clipping : ``float``, optional (default = ``None``).
            If provided, gradients will be clipped `during the backward pass` to have an (absolute)
            maximum of this value. If you are getting ``NaNs`` in your gradients during training
            that are not solved by using ``grad_norm``, you may need this.
        learning_rate_scheduler : ``LearningRateScheduler``, optional (default = None)
            If specified, the learning rate will be decayed with respect to
            this schedule at the end of each epoch (or batch, if the scheduler implements
            the ``step_batch`` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
            this will use the ``validation_metric`` provided to determine if learning has plateaued.
            To support updating the learning rate on every batch, this can optionally implement
            ``step_batch(batch_num_total)`` which updates the learning rate given the batch number.
        momentum_scheduler : ``MomentumScheduler``, optional (default = None)
            If specified, the momentum will be updated at the end of each batch or epoch
            according to the schedule.
        summary_interval: ``int``, optional, (default = 100)
            Number of batches between logging scalars to tensorboard
        histogram_interval : ``int``, optional, (default = ``None``)
            If not None, then log histograms to tensorboard every ``histogram_interval`` batches.
            When this parameter is specified, the following additional logging is enabled:
                * Histograms of model parameters
                * The ratio of parameter update norm to parameter norm
                * Histogram of layer activations
            We log histograms of the parameters returned by
            ``model.get_parameters_for_histogram_tensorboard_logging``.
            The layer activations are logged for any modules in the ``Model`` that have
            the attribute ``should_log_activations`` set to ``True``.  Logging
            histograms requires a number of GPU-CPU copies during training and is typically
            slow, so we recommend logging histograms relatively infrequently.
            Note: only Modules that return tensors, tuples of tensors or dicts
            with tensors as values currently support activation logging.
        should_log_parameter_statistics : ``bool``, optional, (default = True)
            Whether to send parameter statistics (mean and standard deviation
            of parameters and gradients) to tensorboard.
        should_log_learning_rate : ``bool``, optional, (default = False)
            Whether to send parameter specific learning rate to tensorboard.
        log_batch_size_period : ``int``, optional, (default = ``None``)
            If defined, how often to log the average batch size.
        moving_average: ``MovingAverage``, optional, (default = None)
            If provided, we will maintain moving averages for all parameters. During training, we
            employ a shadow variable for each parameter, which maintains the moving average. During
            evaluation, we backup the original parameters and assign the moving averages to corresponding
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
        cold_step_count : int, optional (default = 0)
        cold_lr : float, optional (default = 1e-3)
            Stored on the trainer; presumably the length and learning rate of an
            initial "cold" warm-up phase — confirm against the training entry
            point, as they are not read inside this method.
        cuda_verbose_step : int, optional (default = None)
            If set, CUDA memory statistics are printed every
            ``cuda_verbose_step`` batches during training.
        """
        super().__init__(serialization_dir, cuda_device)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.iterator = iterator
        self._validation_iterator = validation_iterator
        self.shuffle = shuffle
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_data = train_dataset
        self._validation_data = validation_dataset
        self.accumulated_batch_count = accumulated_batch_count
        self.cold_step_count = cold_step_count
        self.cold_lr = cold_lr
        self.cuda_verbose_step = cuda_verbose_step

        if patience is None:  # no early stopping
            if validation_dataset:
                logger.warning(
                    "You provided a validation dataset but patience was set to None, "
                    "meaning that early stopping is disabled"
                )
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError(
                '{} is an invalid value for "patience": it must be a positive integer '
                "or None (if you want to disable early stopping)".format(patience)
            )

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(patience, validation_metric)
        # Get rid of + or -
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            # We can't easily check if these parameters were passed in, so check against their default values.
            # We don't check against serialization_dir since it is also used by the parent class.
            if num_serialized_models_to_keep != 20 \
                    or keep_serialized_model_every_num_seconds is not None:
                raise ConfigurationError(
                    "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                    "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'."
                )
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(
                serialization_dir,
                keep_serialized_model_every_num_seconds,
                num_serialized_models_to_keep,
            )

        self._model_save_interval = model_save_interval

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._moving_average = moving_average

        # We keep the total batch number as an instance variable because it
        # is used inside a closure for the hook which logs activations in
        # ``_enable_activation_logging``.
        self._batch_num_total = 0

        self._tensorboard = TensorboardWriter(
            get_batch_num_total=lambda: self._batch_num_total,
            serialization_dir=serialization_dir,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
        )

        self._log_batch_size_period = log_batch_size_period

        self._last_log = 0.0  # time of last logging

        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self.model)
|
| 263 |
+
|
| 264 |
+
def rescale_gradients(self) -> Optional[float]:
|
| 265 |
+
return training_util.rescale_gradients(self.model, self._grad_norm)
|
| 266 |
+
|
| 267 |
+
def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch.Tensor:
|
| 268 |
+
"""
|
| 269 |
+
Does a forward pass on the given batches and returns the ``loss`` value in the result.
|
| 270 |
+
If ``for_training`` is `True` also applies regularization penalty.
|
| 271 |
+
"""
|
| 272 |
+
if self._multiple_gpu:
|
| 273 |
+
output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
|
| 274 |
+
else:
|
| 275 |
+
assert len(batch_group) == 1
|
| 276 |
+
batch = batch_group[0]
|
| 277 |
+
batch = nn_util.move_to_device(batch, self._cuda_devices[0])
|
| 278 |
+
output_dict = self.model(**batch)
|
| 279 |
+
|
| 280 |
+
try:
|
| 281 |
+
loss = output_dict["loss"]
|
| 282 |
+
if for_training:
|
| 283 |
+
loss += self.model.get_regularization_penalty()
|
| 284 |
+
except KeyError:
|
| 285 |
+
if for_training:
|
| 286 |
+
raise RuntimeError(
|
| 287 |
+
"The model you are trying to optimize does not contain a"
|
| 288 |
+
" 'loss' key in the output of model.forward(inputs)."
|
| 289 |
+
)
|
| 290 |
+
loss = None
|
| 291 |
+
|
| 292 |
+
return loss
|
| 293 |
+
|
| 294 |
+
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.

        Gradient accumulation: each batch's loss is divided by the size of its
        accumulation window (``self.accumulated_batch_count``, or the smaller
        trailing remainder) and ``optimizer.step()`` is only called at window
        boundaries or on the epoch's final batch.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus)
        # Batches left over after full accumulation windows; the epoch's last
        # (partial) window has this many batches.
        residue = num_training_batches % self.accumulated_batch_count
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        # NOTE(review): __init__ initializes _batch_num_total to 0, so this
        # None-guard looks unreachable; kept as a defensive check.
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)
        cumulative_batch_size = 0
        self.optimizer.zero_grad()
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            # Size of this batch's accumulation window: full-sized except for
            # the trailing remainder at the end of the epoch.
            iter_len = self.accumulated_batch_count \
                if batches_this_epoch <= (num_training_batches - residue) else residue

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
                print(f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')
            try:
                # Divide by the window size so accumulated gradients average
                # (rather than sum) over the window.
                loss = self.batch_loss(batch_group, for_training=True) / iter_len
            except RuntimeError as e:
                # Dump per-batch tensor shapes/ranges to help diagnose CUDA
                # OOMs and shape mismatches, then re-raise.
                print(e)
                for x in batch_group:
                    all_words = [len(y['words']) for y in x['metadata']]
                    print(f"Total sents: {len(all_words)}. "
                          f"Min {min(all_words)}. Max {max(all_words)}")
                    for elem in ['labels', 'd_tags']:
                        tt = x[elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
                    for elem in ["bert", "mask", "bert-offsets"]:
                        tt = x['tokens'][elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}")
                raise e

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
                print(f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
                print(f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

            # Undo the /iter_len normalization so the running total reflects
            # the raw per-batch loss.
            train_loss += loss.item() * iter_len

            del batch_group, loss
            torch.cuda.empty_cache()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}')
                print(f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}')

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                # Step only at accumulation-window boundaries or on the final
                # batch of the epoch.
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name, update_norm / (param_norm + 1e-7)
                    )
            else:
                # Same stepping rule as above, without the histogram bookkeeping.
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                    "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
                )

        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
|
| 459 |
+
|
| 460 |
+
def _validation_loss(self) -> Tuple[float, int]:
|
| 461 |
+
"""
|
| 462 |
+
Computes the validation loss. Returns it and the number of batches.
|
| 463 |
+
"""
|
| 464 |
+
logger.info("Validating")
|
| 465 |
+
|
| 466 |
+
self.model.eval()
|
| 467 |
+
|
| 468 |
+
# Replace parameter values with the shadow values from the moving averages.
|
| 469 |
+
if self._moving_average is not None:
|
| 470 |
+
self._moving_average.assign_average_value()
|
| 471 |
+
|
| 472 |
+
if self._validation_iterator is not None:
|
| 473 |
+
val_iterator = self._validation_iterator
|
| 474 |
+
else:
|
| 475 |
+
val_iterator = self.iterator
|
| 476 |
+
|
| 477 |
+
num_gpus = len(self._cuda_devices)
|
| 478 |
+
|
| 479 |
+
raw_val_generator = val_iterator(self._validation_data, num_epochs=1, shuffle=False)
|
| 480 |
+
val_generator = lazy_groups_of(raw_val_generator, num_gpus)
|
| 481 |
+
num_validation_batches = math.ceil(
|
| 482 |
+
val_iterator.get_num_batches(self._validation_data) / num_gpus
|
| 483 |
+
)
|
| 484 |
+
val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches)
|
| 485 |
+
batches_this_epoch = 0
|
| 486 |
+
val_loss = 0
|
| 487 |
+
for batch_group in val_generator_tqdm:
|
| 488 |
+
|
| 489 |
+
loss = self.batch_loss(batch_group, for_training=False)
|
| 490 |
+
if loss is not None:
|
| 491 |
+
# You shouldn't necessarily have to compute a loss for validation, so we allow for
|
| 492 |
+
# `loss` to be None. We need to be careful, though - `batches_this_epoch` is
|
| 493 |
+
# currently only used as the divisor for the loss function, so we can safely only
|
| 494 |
+
# count those batches for which we actually have a loss. If this variable ever
|
| 495 |
+
# gets used for something else, we might need to change things around a bit.
|
| 496 |
+
batches_this_epoch += 1
|
| 497 |
+
val_loss += loss.detach().cpu().numpy()
|
| 498 |
+
|
| 499 |
+
# Update the description with the latest metrics
|
| 500 |
+
val_metrics = training_util.get_metrics(self.model, val_loss, batches_this_epoch)
|
| 501 |
+
description = training_util.description_from_metrics(val_metrics)
|
| 502 |
+
val_generator_tqdm.set_description(description, refresh=False)
|
| 503 |
+
|
| 504 |
+
# Now restore the original parameter values.
|
| 505 |
+
if self._moving_average is not None:
|
| 506 |
+
self._moving_average.restore()
|
| 507 |
+
|
| 508 |
+
return val_loss, batches_this_epoch
|
| 509 |
+
|
| 510 |
+
    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.

        Restores any existing checkpoint, optionally runs ``cold_step_count``
        "cold" epochs (BERT embedder frozen, learning rate set to ``cold_lr``),
        then alternates training and validation epochs, tracking the best
        validation metric for early stopping and checkpointing every epoch.
        Finally reloads the best model weights before returning.

        Returns
        -------
        A dictionary of training and validation metrics, including the best
        epoch and the metrics achieved at it.

        Raises
        ------
        ConfigurationError
            If an existing checkpoint cannot be restored.
        """
        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint. Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?"
            )

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        train_metrics: Dict[str, float] = {}
        val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: Optional[float] = None
        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        # Cold phase: drop to the cold learning rate and freeze the BERT
        # embedder weights; ``base_lr`` remembers the configured rate so it
        # can be restored when the cold phase ends.
        if self.cold_step_count > 0:
            base_lr = self.optimizer.param_groups[0]['lr']
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.cold_lr
            self.model.text_field_embedder._token_embedders['bert'].set_weights(freeze=True)

        metrics["best_epoch"] = self._metric_tracker.best_epoch
        for key, value in self._metric_tracker.best_epoch_metrics.items():
            metrics["best_validation_" + key] = value

        for epoch in range(epoch_counter, self._num_epochs):
            # End of the cold phase: restore the base learning rate and
            # unfreeze the BERT embedder.
            if epoch == self.cold_step_count and epoch != 0:
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = base_lr
                self.model.text_field_embedder._token_embedders['bert'].set_weights(freeze=False)

            epoch_start_time = time.time()
            train_metrics = self._train_epoch(epoch)

            # get peak of memory usage
            if "cpu_memory_MB" in train_metrics:
                metrics["peak_cpu_memory_MB"] = max(
                    metrics.get("peak_cpu_memory_MB", 0), train_metrics["cpu_memory_MB"]
                )
            for key, value in train_metrics.items():
                if key.startswith("gpu_"):
                    metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value)

            # clear cache before validation
            torch.cuda.empty_cache()
            if self._validation_data is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, num_batches = self._validation_loss()
                    val_metrics = training_util.get_metrics(
                        self.model, val_loss, num_batches, reset=True
                    )

                    # Check validation metric for early stopping
                    this_epoch_val_metric = val_metrics[self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                    if self._metric_tracker.should_stop_early():
                        logger.info("Ran out of patience. Stopping training.")
                        break

            self._tensorboard.log_metrics(
                train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1
            )  # +1 because tensorboard doesn't like 0

            # Create overall metrics dict
            training_elapsed_time = time.time() - training_start_time
            metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time))
            metrics["training_start_epoch"] = epoch_counter
            metrics["training_epochs"] = epochs_trained
            metrics["epoch"] = epoch

            for key, value in train_metrics.items():
                metrics["training_" + key] = value
            for key, value in val_metrics.items():
                metrics["validation_" + key] = value

            # if self.cold_step_count <= epoch:
            # NOTE(review): this assumes validation data exists every epoch —
            # with no validation set, 'validation_loss' is absent and this
            # raises KeyError. Confirm callers always supply validation data.
            self.scheduler.step(metrics['validation_loss'])

            if self._metric_tracker.is_best_so_far():
                # Update all the best_ metrics.
                # (Otherwise they just stay the same as they were.)
                metrics["best_epoch"] = epoch
                for key, value in val_metrics.items():
                    metrics["best_validation_" + key] = value

                self._metric_tracker.best_epoch_metrics = val_metrics

            if self._serialization_dir:
                dump_metrics(
                    os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics
                )

            # The Scheduler API is agnostic to whether your schedule requires a validation metric -
            # if it doesn't, the validation metric passed here is ignored.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step(this_epoch_val_metric, epoch)
            if self._momentum_scheduler:
                self._momentum_scheduler.step(this_epoch_val_metric, epoch)

            self._save_checkpoint(epoch)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time))

            if epoch < self._num_epochs - 1:
                # Linear extrapolation of remaining wall-clock time from the
                # average time per completed epoch.
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * (
                    (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1
                )
                formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s", formatted_time)

            epochs_trained += 1

        # make sure pending events are flushed to disk and files are closed properly
        # self._tensorboard.close()

        # Load the best model state before returning
        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics
|
| 645 |
+
|
| 646 |
+
def _save_checkpoint(self, epoch: Union[int, str]) -> None:
|
| 647 |
+
"""
|
| 648 |
+
Saves a checkpoint of the model to self._serialization_dir.
|
| 649 |
+
Is a no-op if self._serialization_dir is None.
|
| 650 |
+
|
| 651 |
+
Parameters
|
| 652 |
+
----------
|
| 653 |
+
epoch : Union[int, str], required.
|
| 654 |
+
The epoch of training. If the checkpoint is saved in the middle
|
| 655 |
+
of an epoch, the parameter is a string with the epoch and timestamp.
|
| 656 |
+
"""
|
| 657 |
+
# If moving averages are used for parameters, we save
|
| 658 |
+
# the moving average values into checkpoint, instead of the current values.
|
| 659 |
+
if self._moving_average is not None:
|
| 660 |
+
self._moving_average.assign_average_value()
|
| 661 |
+
|
| 662 |
+
# These are the training states we need to persist.
|
| 663 |
+
training_states = {
|
| 664 |
+
"metric_tracker": self._metric_tracker.state_dict(),
|
| 665 |
+
"optimizer": self.optimizer.state_dict(),
|
| 666 |
+
"batch_num_total": self._batch_num_total,
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
# If we have a learning rate or momentum scheduler, we should persist them too.
|
| 670 |
+
if self._learning_rate_scheduler is not None:
|
| 671 |
+
training_states["learning_rate_scheduler"] = self._learning_rate_scheduler.state_dict()
|
| 672 |
+
if self._momentum_scheduler is not None:
|
| 673 |
+
training_states["momentum_scheduler"] = self._momentum_scheduler.state_dict()
|
| 674 |
+
|
| 675 |
+
self._checkpointer.save_checkpoint(
|
| 676 |
+
model_state=self.model.state_dict(),
|
| 677 |
+
epoch=epoch,
|
| 678 |
+
training_states=training_states,
|
| 679 |
+
is_best_so_far=self._metric_tracker.is_best_so_far(),
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
# Restore the original values for parameters so that training will not be affected.
|
| 683 |
+
if self._moving_average is not None:
|
| 684 |
+
self._moving_average.restore()
|
| 685 |
+
|
| 686 |
+
def _restore_checkpoint(self) -> int:
|
| 687 |
+
"""
|
| 688 |
+
Restores the model and training state from the last saved checkpoint.
|
| 689 |
+
This includes an epoch count and optimizer state, which is serialized separately
|
| 690 |
+
from model parameters. This function should only be used to continue training -
|
| 691 |
+
if you wish to load a model for inference/load parts of a model into a new
|
| 692 |
+
computation graph, you should use the native Pytorch functions:
|
| 693 |
+
`` model.load_state_dict(torch.load("/path/to/model/weights.th"))``
|
| 694 |
+
|
| 695 |
+
If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
|
| 696 |
+
this function will do nothing and return 0.
|
| 697 |
+
|
| 698 |
+
Returns
|
| 699 |
+
-------
|
| 700 |
+
epoch: int
|
| 701 |
+
The epoch at which to resume training, which should be one after the epoch
|
| 702 |
+
in the saved training state.
|
| 703 |
+
"""
|
| 704 |
+
model_state, training_state = self._checkpointer.restore_checkpoint()
|
| 705 |
+
|
| 706 |
+
if not training_state:
|
| 707 |
+
# No checkpoint to restore, start at 0
|
| 708 |
+
return 0
|
| 709 |
+
|
| 710 |
+
self.model.load_state_dict(model_state)
|
| 711 |
+
self.optimizer.load_state_dict(training_state["optimizer"])
|
| 712 |
+
if self._learning_rate_scheduler is not None \
|
| 713 |
+
and "learning_rate_scheduler" in training_state:
|
| 714 |
+
self._learning_rate_scheduler.load_state_dict(training_state["learning_rate_scheduler"])
|
| 715 |
+
if self._momentum_scheduler is not None and "momentum_scheduler" in training_state:
|
| 716 |
+
self._momentum_scheduler.load_state_dict(training_state["momentum_scheduler"])
|
| 717 |
+
training_util.move_optimizer_to_cuda(self.optimizer)
|
| 718 |
+
|
| 719 |
+
# Currently the ``training_state`` contains a serialized ``MetricTracker``.
|
| 720 |
+
if "metric_tracker" in training_state:
|
| 721 |
+
self._metric_tracker.load_state_dict(training_state["metric_tracker"])
|
| 722 |
+
# It used to be the case that we tracked ``val_metric_per_epoch``.
|
| 723 |
+
elif "val_metric_per_epoch" in training_state:
|
| 724 |
+
self._metric_tracker.clear()
|
| 725 |
+
self._metric_tracker.add_metrics(training_state["val_metric_per_epoch"])
|
| 726 |
+
# And before that we didn't track anything.
|
| 727 |
+
else:
|
| 728 |
+
self._metric_tracker.clear()
|
| 729 |
+
|
| 730 |
+
if isinstance(training_state["epoch"], int):
|
| 731 |
+
epoch_to_return = training_state["epoch"] + 1
|
| 732 |
+
else:
|
| 733 |
+
epoch_to_return = int(training_state["epoch"].split(".")[0]) + 1
|
| 734 |
+
|
| 735 |
+
# For older checkpoints with batch_num_total missing, default to old behavior where
|
| 736 |
+
# it is unchanged.
|
| 737 |
+
batch_num_total = training_state.get("batch_num_total")
|
| 738 |
+
if batch_num_total is not None:
|
| 739 |
+
self._batch_num_total = batch_num_total
|
| 740 |
+
|
| 741 |
+
return epoch_to_return
|
| 742 |
+
|
| 743 |
+
# Requires custom from_params.
|
| 744 |
+
@classmethod
|
| 745 |
+
def from_params( # type: ignore
|
| 746 |
+
cls,
|
| 747 |
+
model: Model,
|
| 748 |
+
serialization_dir: str,
|
| 749 |
+
iterator: DataIterator,
|
| 750 |
+
train_data: Iterable[Instance],
|
| 751 |
+
validation_data: Optional[Iterable[Instance]],
|
| 752 |
+
params: Params,
|
| 753 |
+
validation_iterator: DataIterator = None,
|
| 754 |
+
) -> "Trainer":
|
| 755 |
+
|
| 756 |
+
patience = params.pop_int("patience", None)
|
| 757 |
+
validation_metric = params.pop("validation_metric", "-loss")
|
| 758 |
+
shuffle = params.pop_bool("shuffle", True)
|
| 759 |
+
num_epochs = params.pop_int("num_epochs", 20)
|
| 760 |
+
cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
|
| 761 |
+
grad_norm = params.pop_float("grad_norm", None)
|
| 762 |
+
grad_clipping = params.pop_float("grad_clipping", None)
|
| 763 |
+
lr_scheduler_params = params.pop("learning_rate_scheduler", None)
|
| 764 |
+
momentum_scheduler_params = params.pop("momentum_scheduler", None)
|
| 765 |
+
|
| 766 |
+
if isinstance(cuda_device, list):
|
| 767 |
+
model_device = cuda_device[0]
|
| 768 |
+
else:
|
| 769 |
+
model_device = cuda_device
|
| 770 |
+
if model_device >= 0:
|
| 771 |
+
# Moving model to GPU here so that the optimizer state gets constructed on
|
| 772 |
+
# the right device.
|
| 773 |
+
model = model.cuda(model_device)
|
| 774 |
+
|
| 775 |
+
parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
|
| 776 |
+
optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
|
| 777 |
+
if "moving_average" in params:
|
| 778 |
+
moving_average = MovingAverage.from_params(
|
| 779 |
+
params.pop("moving_average"), parameters=parameters
|
| 780 |
+
)
|
| 781 |
+
else:
|
| 782 |
+
moving_average = None
|
| 783 |
+
|
| 784 |
+
if lr_scheduler_params:
|
| 785 |
+
lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
|
| 786 |
+
else:
|
| 787 |
+
lr_scheduler = None
|
| 788 |
+
if momentum_scheduler_params:
|
| 789 |
+
momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
|
| 790 |
+
else:
|
| 791 |
+
momentum_scheduler = None
|
| 792 |
+
|
| 793 |
+
if "checkpointer" in params:
|
| 794 |
+
if "keep_serialized_model_every_num_seconds" in params \
|
| 795 |
+
or "num_serialized_models_to_keep" in params:
|
| 796 |
+
raise ConfigurationError(
|
| 797 |
+
"Checkpointer may be initialized either from the 'checkpointer' key or from the "
|
| 798 |
+
"keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
|
| 799 |
+
" but the passed config uses both methods."
|
| 800 |
+
)
|
| 801 |
+
checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
|
| 802 |
+
else:
|
| 803 |
+
num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
|
| 804 |
+
keep_serialized_model_every_num_seconds = params.pop_int(
|
| 805 |
+
"keep_serialized_model_every_num_seconds", None
|
| 806 |
+
)
|
| 807 |
+
checkpointer = Checkpointer(
|
| 808 |
+
serialization_dir=serialization_dir,
|
| 809 |
+
num_serialized_models_to_keep=num_serialized_models_to_keep,
|
| 810 |
+
keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
|
| 811 |
+
)
|
| 812 |
+
model_save_interval = params.pop_float("model_save_interval", None)
|
| 813 |
+
summary_interval = params.pop_int("summary_interval", 100)
|
| 814 |
+
histogram_interval = params.pop_int("histogram_interval", None)
|
| 815 |
+
should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
|
| 816 |
+
should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
|
| 817 |
+
log_batch_size_period = params.pop_int("log_batch_size_period", None)
|
| 818 |
+
|
| 819 |
+
params.assert_empty(cls.__name__)
|
| 820 |
+
return cls(
|
| 821 |
+
model,
|
| 822 |
+
optimizer,
|
| 823 |
+
iterator,
|
| 824 |
+
train_data,
|
| 825 |
+
validation_data,
|
| 826 |
+
patience=patience,
|
| 827 |
+
validation_metric=validation_metric,
|
| 828 |
+
validation_iterator=validation_iterator,
|
| 829 |
+
shuffle=shuffle,
|
| 830 |
+
num_epochs=num_epochs,
|
| 831 |
+
serialization_dir=serialization_dir,
|
| 832 |
+
cuda_device=cuda_device,
|
| 833 |
+
grad_norm=grad_norm,
|
| 834 |
+
grad_clipping=grad_clipping,
|
| 835 |
+
learning_rate_scheduler=lr_scheduler,
|
| 836 |
+
momentum_scheduler=momentum_scheduler,
|
| 837 |
+
checkpointer=checkpointer,
|
| 838 |
+
model_save_interval=model_save_interval,
|
| 839 |
+
summary_interval=summary_interval,
|
| 840 |
+
histogram_interval=histogram_interval,
|
| 841 |
+
should_log_parameter_statistics=should_log_parameter_statistics,
|
| 842 |
+
should_log_learning_rate=should_log_learning_rate,
|
| 843 |
+
log_batch_size_period=log_batch_size_period,
|
| 844 |
+
moving_average=moving_average,
|
| 845 |
+
)
|