versae committed on Jan 19, 2022

Commit

9c3de9e

1 Parent(s): da38e4c

Saving weights and logs of step 1000

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
config.json +26 -0
eval_results.json +5 -0
events.out.tfevents.1642203685.t1v-n-eedfb410-w-0.10537.0.v2 +3 -0
events.out.tfevents.1642204242.t1v-n-eedfb410-w-0.profile-empty +3 -0
events.out.tfevents.1642608722.t1v-n-eedfb410-w-0.1271442.0.v2 +3 -0
flax_model.msgpack +3 -0
merges.txt +0 -0
run_mlm_flax.py +815 -0
special_tokens_map.json +1 -0
tokenizer.json +0 -0
tokenizer_config.json +1 -0
train.128.sh +25 -0
train.512.sh +26 -0
vocab.json +0 -0
wandb/debug-internal.log +1 -0
wandb/debug.log +1 -0
wandb/latest-run +1 -0
wandb/run-20220114_212855-32qdb4k5/files/code/run_mlm_flax.py +815 -0
wandb/run-20220114_212855-32qdb4k5/files/config.yaml +152 -0
wandb/run-20220114_212855-32qdb4k5/files/diff.patch +0 -0
wandb/run-20220114_212855-32qdb4k5/files/output.log +43 -0
wandb/run-20220114_212855-32qdb4k5/files/requirements.txt +122 -0
wandb/run-20220114_212855-32qdb4k5/files/wandb-metadata.json +47 -0
wandb/run-20220114_212855-32qdb4k5/files/wandb-summary.json +1 -0
wandb/run-20220114_212855-32qdb4k5/logs/debug-internal.log +189 -0
wandb/run-20220114_212855-32qdb4k5/logs/debug.log +150 -0
wandb/run-20220114_212855-32qdb4k5/run-32qdb4k5.wandb +0 -0
wandb/run-20220114_221533-24dma583/files/code/run_mlm_flax.py +815 -0
wandb/run-20220114_221533-24dma583/files/config.yaml +152 -0
wandb/run-20220114_221533-24dma583/files/diff.patch +0 -0
wandb/run-20220114_221533-24dma583/files/output.log +43 -0
wandb/run-20220114_221533-24dma583/files/requirements.txt +122 -0
wandb/run-20220114_221533-24dma583/files/wandb-metadata.json +47 -0
wandb/run-20220114_221533-24dma583/files/wandb-summary.json +1 -0
wandb/run-20220114_221533-24dma583/logs/debug-internal.log +187 -0
wandb/run-20220114_221533-24dma583/logs/debug.log +141 -0
wandb/run-20220114_221533-24dma583/run-24dma583.wandb +0 -0
wandb/run-20220114_234119-1zya86oe/files/code/run_mlm_flax.py +815 -0
wandb/run-20220114_234119-1zya86oe/files/config.yaml +152 -0
wandb/run-20220114_234119-1zya86oe/files/diff.patch +0 -0
wandb/run-20220114_234119-1zya86oe/files/output.log +3 -0
wandb/run-20220114_234119-1zya86oe/files/requirements.txt +122 -0
wandb/run-20220114_234119-1zya86oe/files/wandb-metadata.json +47 -0
wandb/run-20220114_234119-1zya86oe/files/wandb-summary.json +1 -0
wandb/run-20220114_234119-1zya86oe/logs/debug-internal.log +3 -0
wandb/run-20220114_234119-1zya86oe/logs/debug.log +168 -0
wandb/run-20220114_234119-1zya86oe/run-1zya86oe.wandb +3 -0
wandb/run-20220119_161158-274aad95/files/code/run_mlm_flax.py +815 -0
wandb/run-20220119_161158-274aad95/files/config.yaml +147 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20220114_234119-1zya86oe/files/output.log filter=lfs diff=lfs merge=lfs -text
+wandb/run-20220114_234119-1zya86oe/logs/debug-internal.log filter=lfs diff=lfs merge=lfs -text
+wandb/run-20220114_234119-1zya86oe/run-1zya86oe.wandb filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "./",
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.16.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "eval_accuracy": 0.6885889057979672,
+    "eval_loss": 1.430497475427537,
+    "eval_perplexity": 4.180778509252052
+}

events.out.tfevents.1642203685.t1v-n-eedfb410-w-0.10537.0.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d263549b78d9bb80e115caaffac01f4b141c31e954b64fb7a2ee59c5e4138641
+size 19208160

events.out.tfevents.1642204242.t1v-n-eedfb410-w-0.profile-empty ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ac614ecef2709e4ed2bc443ce4ade10122a22097363c5eb86dfadf8e74fa7c5
+size 40

events.out.tfevents.1642608722.t1v-n-eedfb410-w-0.1271442.0.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d194308b26bfc127e2d922f51dc9ca9914119f3b530144d116457498072ad97
+size 147136

flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a12b2311a330a0e74118143c453bf8ad2490d7109372302e7cf9b878f65e181
+size 498796983

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

run_mlm_flax.py ADDED Viewed

	@@ -0,0 +1,815 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
+text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=fill-mask
+"""
+import json
+import logging
+import math
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from itertools import chain
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    FlaxAutoModelForMaskedLM,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    TensorType,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.file_utils import get_full_repo_name
+# Keys of the Flax masked-LM model mapping; each key exposes a `model_type` attribute.
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+# The `model_type` value of every key above, interpolated into the `model_type` help text of ModelArguments.
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+@dataclass
+class TrainingArguments:
+    """
+    Options for the training/evaluation run: output location, batch sizes, optimizer hyper-parameters,
+    logging/saving/evaluation cadence, seed and Hub upload settings.
+    """
+    output_dir: str = field(
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Overwrite the content of the output directory. "
+                "Use this to continue training if output_dir points to a checkpoint directory."
+            )
+        },
+    )
+    do_train: bool = field(
+        default=False,
+        metadata={"help": "Whether to run training."},
+    )
+    do_eval: bool = field(
+        default=False,
+        metadata={"help": "Whether to run eval on the dev set."},
+    )
+    per_device_train_batch_size: int = field(
+        default=8,
+        metadata={"help": "Batch size per GPU/TPU core/CPU for training."},
+    )
+    per_device_eval_batch_size: int = field(
+        default=8,
+        metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."},
+    )
+    learning_rate: float = field(
+        default=5e-5,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    weight_decay: float = field(
+        default=0.0,
+        metadata={"help": "Weight decay for AdamW if we apply some."},
+    )
+    adam_beta1: float = field(
+        default=0.9,
+        metadata={"help": "Beta1 for AdamW optimizer"},
+    )
+    adam_beta2: float = field(
+        default=0.999,
+        metadata={"help": "Beta2 for AdamW optimizer"},
+    )
+    adam_epsilon: float = field(
+        default=1e-8,
+        metadata={"help": "Epsilon for AdamW optimizer."},
+    )
+    adafactor: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to replace AdamW by Adafactor."},
+    )
+    num_train_epochs: float = field(
+        default=3.0,
+        metadata={"help": "Total number of training epochs to perform."},
+    )
+    warmup_steps: int = field(
+        default=0,
+        metadata={"help": "Linear warmup over warmup_steps."},
+    )
+    logging_steps: int = field(
+        default=500,
+        metadata={"help": "Log every X updates steps."},
+    )
+    save_steps: int = field(
+        default=500,
+        metadata={"help": "Save checkpoint every X updates steps."},
+    )
+    eval_steps: int = field(
+        default=None,
+        metadata={"help": "Run an evaluation every X steps."},
+    )
+    seed: int = field(
+        default=42,
+        metadata={"help": "Random seed that will be set at the beginning of training."},
+    )
+    push_to_hub: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to upload the trained model to the model hub after training."},
+    )
+    hub_model_id: str = field(
+        default=None,
+        metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."},
+    )
+    hub_token: str = field(
+        default=None,
+        metadata={"help": "The token to use to push to the Model Hub."},
+    )
+    def __post_init__(self):
+        # Nothing to normalise when no output directory was supplied.
+        if self.output_dir is None:
+            return
+        # Resolve a leading "~" to the user's home directory.
+        self.output_dir = os.path.expanduser(self.output_dir)
+    def to_dict(self):
+        """
+        Return the arguments as a plain dict: `Enum` members (alone or in a list) are replaced by their
+        values, and every field whose name ends in `_token` is replaced by a `<NAME>` placeholder so that
+        the real token is never serialized.
+        """
+        serialized = asdict(self)
+        for name in serialized:
+            value = serialized[name]
+            if isinstance(value, Enum):
+                serialized[name] = value.value
+            elif isinstance(value, list) and value and isinstance(value[0], Enum):
+                serialized[name] = [member.value for member in value]
+            # Applied last so it overrides whatever was stored above.
+            if name.endswith("_token"):
+                serialized[name] = f"<{name.upper()}>"
+        return serialized
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+    # Checkpoint to start from; leave unset to train from scratch.
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            # The trailing space keeps the two sentences apart once the literals are concatenated.
+            "help": "The model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    # Architecture name used when no checkpoint is given; valid values come from MODEL_TYPES.
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    # Name of a dtype attribute looked up on `jax.numpy` when the model is created.
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments describing the data fed to the model for training and evaluation, and how it is preprocessed.
+    """
+    dataset_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."},
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
+    )
+    train_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "The input training data file (a text file)."},
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False,
+        metadata={"help": "Overwrite the cached training and evaluation sets"},
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15,
+        metadata={"help": "Ratio of tokens to mask for masked language modeling loss"},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    def __post_init__(self):
+        # At least one data source (hub dataset or local file) must be given.
+        sources = (self.dataset_name, self.train_file, self.validation_file)
+        if all(source is None for source in sources):
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        # Any local file that was supplied must carry one of the supported extensions.
+        for arg_name in ("train_file", "validation_file"):
+            path = getattr(self, arg_name)
+            if path is None:
+                continue
+            suffix = path.rsplit(".", 1)[-1]
+            assert suffix in ("csv", "json", "txt"), f"`{arg_name}` should be a csv, a json or a txt file."
+@flax.struct.dataclass
+class FlaxDataCollatorForLanguageModeling:
+    """
+    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
+    are not all of the same length.
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input.
+    .. note::
+        For best performance, this data collator should be used with a dataset having items that are dictionaries or
+        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
+        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
+        argument :obj:`return_special_tokens_mask=True`. When the key is missing, the mask is recomputed from the
+        tokenizer for every batch.
+    """
+    tokenizer: PreTrainedTokenizerBase
+    mlm_probability: float = 0.15
+    def __post_init__(self):
+        # Masked language modeling cannot work without a mask token.
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                "You should pass `mlm=False` to train on causal language modeling instead."
+            )
+    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
+        """
+        Pad `examples` into one NumPy batch and replace its `input_ids` with masked inputs plus matching `labels`.
+        """
+        # Handle dict or lists with proper padding and conversion to tensor.
+        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
+        # If special token mask has been preprocessed, pop it from the dict (None when it was not provided).
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        batch["input_ids"], batch["labels"] = self.mask_tokens(
+            batch["input_ids"], special_tokens_mask=special_tokens_mask
+        )
+        return batch
+    def mask_tokens(
+        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        `inputs` is modified in place and also returned. `labels` holds the original ids at the selected
+        positions and -100 everywhere else. If `special_tokens_mask` is None it is derived from the tokenizer.
+        """
+        labels = inputs.copy()
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = np.full(labels.shape, self.mlm_probability)
+        if special_tokens_mask is None:
+            # The dataset did not carry a precomputed mask: ask the tokenizer which ids are special tokens.
+            special_tokens_mask = np.array(
+                [
+                    self.tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
+                    for row in labels.tolist()
+                ],
+                dtype="bool",
+            )
+        else:
+            special_tokens_mask = special_tokens_mask.astype("bool")
+        # Special tokens are never selected for masking.
+        probability_matrix[special_tokens_mask] = 0.0
+        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+        # 10% of the time, we replace masked input tokens with random word
+        # (half of the 20% of selected positions that were not replaced by the mask token).
+        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
+        indices_random &= masked_indices & ~indices_replaced
+        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
+        inputs[indices_random] = random_words[indices_random]
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    """
+    Split `samples_idx` into consecutive batches of exactly `batch_size` indices.
+    Trailing indices that do not fill a complete batch are dropped. Returns the list of sub-arrays produced
+    by `np.split`, or an empty list when there are fewer than `batch_size` samples.
+    """
+    num_samples = len(samples_idx)
+    num_batches = num_samples // batch_size
+    if num_batches == 0:
+        # `np.split` rejects a request for zero sections, so report "no batches" explicitly.
+        return []
+    # Keep only the prefix that divides evenly into full batches.
+    samples_idx = samples_idx[: num_batches * batch_size]
+    return np.split(samples_idx, num_batches)
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    """
+    Log the elapsed training time at `step`, then log every collected training metric value, with the last
+    value of each metric landing on `step` and earlier values on the steps immediately before it.
+    """
+    summary_writer.scalar("train_time", train_time, step)
+    collected = get_metrics(train_metrics)
+    for name, history in collected.items():
+        # Step assigned to the first value so that the final value is written at `step`.
+        first_step = step - len(history) + 1
+        for offset, value in enumerate(history):
+            summary_writer.scalar(f"train_{name}", value, first_step + offset)
+def write_eval_metric(summary_writer, eval_metrics, step):
+    """
+    Log each entry of `eval_metrics` to `summary_writer` at `step`, under the tag `eval_<metric name>`.
+    """
+    for name, score in eval_metrics.items():
+        tag = f"eval_{name}"
+        summary_writer.scalar(tag, score, step)
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        level=logging.INFO,
+        datefmt="[%X]",
+    )
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model and tokenizer
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+            return tokenizer(
+                examples,
+                return_special_tokens_mask=True,
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+            )
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            input_columns=[text_column_name],
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            # Enable Weight&Biases
+            import wandb
+            wandb.init(
+                entity='versae',
+                project='roberta-base-ncc',
+                sync_tensorboard=False,
+            )
+            wandb.config.update(training_args)
+            wandb.config.update(model_args)
+            wandb.config.update(data_args)
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+    # create adam optimizer
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        def loss_fn(params):
+            labels = batch.pop("labels")
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            # compute loss, ignore padded input tokens
+            label_mask = jnp.where(labels > 0, 1.0, 0.0)
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+            # take average
+            loss = loss.sum() / label_mask.sum()
+            return loss
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+        return new_state, metrics, new_dropout_rng
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        # compute loss, ignore padded input tokens
+        label_mask = jnp.where(labels > 0, 1.0, 0.0)
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
+        # summarize metrics
+        metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
+        metrics = jax.lax.psum(metrics, axis_name="batch")
+        return metrics
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+                train_metrics = []
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+    # Eval after training
+    if training_args.do_eval:
+        num_eval_samples = len(tokenized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+        eval_metrics = []
+        for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+        # normalize eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
+        eval_normalizer = eval_metrics.pop("normalizer")
+        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+        try:
+            perplexity = math.exp(eval_metrics["loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        eval_metrics["perplexity"] = perplexity
+        if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+# Run the training entry point only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "trim_offsets": true, "special_tokens_map_file": null, "name_or_path": "./", "tokenizer_class": "RobertaTokenizer"}

train.128.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+python run_mlm_flax.py \
+    --output_dir="./" \
+    --model_type="roberta" \
+    --config_name="roberta-base" \
+    --tokenizer_name="NbAiLab/nb-roberta-base" \
+    --dataset_name="NbAiLab/NCC" \
+    --max_seq_length="128" \
+    --weight_decay="0.01" \
+    --per_device_train_batch_size="232" \
+    --per_device_eval_batch_size="232" \
+    --pad_to_max_length \
+    --learning_rate="6e-4" \
+    --warmup_steps="10000" \
+    --overwrite_output_dir \
+    --num_train_epochs="3" \
+    --adam_beta1="0.9" \
+    --adam_beta2="0.98" \
+    --adam_epsilon="1e-6" \
+    --logging_steps="1000" \
+    --save_steps="1000" \
+    --eval_steps="1000" \
+    --do_train \
+    --do_eval \
+    --dtype="bfloat16" \
+    --push_to_hub

train.512.sh ADDED Viewed

	@@ -0,0 +1,26 @@

+python run_mlm_flax.py \
+    --output_dir="./" \
+    --model_type="roberta" \
+    --model_name_or_path="./" \
+    --config_name="./" \
+    --tokenizer_name="./" \
+    --dataset_name="NbAiLab/NCC" \
+    --max_seq_length="512" \
+    --weight_decay="0.01" \
+    --per_device_train_batch_size="46" \
+    --per_device_eval_batch_size="46" \
+    --pad_to_max_length \
+    --learning_rate="6e-4" \
+    --warmup_steps="1000" \
+    --overwrite_output_dir \
+    --num_train_epochs="3" \
+    --adam_beta1="0.9" \
+    --adam_beta2="0.98" \
+    --adam_epsilon="1e-6" \
+    --logging_steps="1000" \
+    --save_steps="1000" \
+    --eval_steps="1000" \
+    --do_train \
+    --do_eval \
+    --dtype="bfloat16" \
+    --push_to_hub

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/debug-internal.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ run-20220119_161158-274aad95/logs/debug-internal.log

wandb/debug.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ run-20220119_161158-274aad95/logs/debug.log

wandb/latest-run ADDED Viewed

	@@ -0,0 +1 @@


1	+ run-20220119_161158-274aad95

wandb/run-20220114_212855-32qdb4k5/files/code/run_mlm_flax.py ADDED Viewed

	@@ -0,0 +1,815 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
+text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=fill-mask
+"""
+import json
+import logging
+import math
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from itertools import chain
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    FlaxAutoModelForMaskedLM,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    TensorType,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.file_utils import get_full_repo_name
+# Keys of transformers' FLAX_MODEL_FOR_MASKED_LM_MAPPING (presumably config classes; each exposes `model_type`).
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+# The `model_type` string of every key above; joined into the `--model_type` help text of ModelArguments.
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+@dataclass
+class TrainingArguments:
+    output_dir: str = field(
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Overwrite the content of the output directory. "
+                "Use this to continue training if output_dir points to a checkpoint directory."
+            )
+        },
+    )
+    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
+    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+    per_device_train_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
+    )
+    per_device_eval_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
+    )
+    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+    adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
+    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
+    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+    eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+    seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
+    push_to_hub: bool = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    hub_model_id: str = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
+    def __post_init__(self):
+        if self.output_dir is not None:
+            self.output_dir = os.path.expanduser(self.output_dir)
+    def to_dict(self):
+        """
+        Serializes this instance while replace `Enum` by their values (for JSON serialization support). It obfuscates
+        the token values by removing their value.
+        """
+        d = asdict(self)
+        for k, v in d.items():
+            if isinstance(v, Enum):
+                d[k] = v.value
+            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
+                d[k] = [x.value for x in v]
+            if k.endswith("_token"):
+                d[k] = f"<{k.upper()}>"
+        return d
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    def __post_init__(self):
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+@flax.struct.dataclass
+class FlaxDataCollatorForLanguageModeling:
+    """
+    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
+    are not all of the same length.
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input.
+    .. note::
+        For best performance, this data collator should be used with a dataset having items that are dictionaries or
+        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
+        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
+        argument :obj:`return_special_tokens_mask=True`.
+    """
+    tokenizer: PreTrainedTokenizerBase
+    mlm_probability: float = 0.15
+    def __post_init__(self):
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                "You should pass `mlm=False` to train on causal language modeling instead."
+            )
+    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
+        # Handle dict or lists with proper padding and conversion to tensor.
+        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        batch["input_ids"], batch["labels"] = self.mask_tokens(
+            batch["input_ids"], special_tokens_mask=special_tokens_mask
+        )
+        return batch
+    def mask_tokens(
+        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        """
+        labels = inputs.copy()
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = np.full(labels.shape, self.mlm_probability)
+        special_tokens_mask = special_tokens_mask.astype("bool")
+        probability_matrix[special_tokens_mask] = 0.0
+        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
+        indices_random &= masked_indices & ~indices_replaced
+        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
+        inputs[indices_random] = random_words[indices_random]
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    num_samples = len(samples_idx)
+    samples_to_remove = num_samples % batch_size
+    if samples_to_remove != 0:
+        samples_idx = samples_idx[:-samples_to_remove]
+    sections_split = num_samples // batch_size
+    batch_idx = np.split(samples_idx, sections_split)
+    return batch_idx
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    summary_writer.scalar("train_time", train_time, step)
+    train_metrics = get_metrics(train_metrics)
+    for key, vals in train_metrics.items():
+        tag = f"train_{key}"
+        for i, val in enumerate(vals):
+            summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+def write_eval_metric(summary_writer, eval_metrics, step):
+    for metric_name, value in eval_metrics.items():
+        summary_writer.scalar(f"eval_{metric_name}", value, step)
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        level=logging.INFO,
+        datefmt="[%X]",
+    )
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model and tokenizer
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+            return tokenizer(
+                examples,
+                return_special_tokens_mask=True,
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+            )
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            input_columns=[text_column_name],
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            # Enable Weight&Biases
+            import wandb
+            wandb.init(
+                entity='versae',
+                project='roberta-base-ncc',
+                sync_tensorboard=False,
+            )
+            wandb.config.update(training_args)
+            wandb.config.update(model_args)
+            wandb.config.update(data_args)
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    # Store some constants
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+    # Create the optimizer: Adafactor if requested, AdamW otherwise
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        def loss_fn(params):
+            labels = batch.pop("labels")
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            # compute loss, ignore padded input tokens
+            label_mask = jnp.where(labels > 0, 1.0, 0.0)
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+            # take average
+            loss = loss.sum() / label_mask.sum()
+            return loss
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+        return new_state, metrics, new_dropout_rng
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        # compute loss, ignore padded input tokens
+        label_mask = jnp.where(labels > 0, 1.0, 0.0)
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
+        # summarize metrics
+        metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
+        metrics = jax.lax.psum(metrics, axis_name="batch")
+        return metrics
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+                train_metrics = []
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save checkpoint every `save_steps` steps and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+    # Eval after training
+    if training_args.do_eval:
+        num_eval_samples = len(tokenized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+        eval_metrics = []
+        for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+        # normalize eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
+        eval_normalizer = eval_metrics.pop("normalizer")
+        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+        try:
+            perplexity = math.exp(eval_metrics["loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        eval_metrics["perplexity"] = perplexity
+        if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+if __name__ == "__main__":
+    main()

wandb/run-20220114_212855-32qdb4k5/files/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.12.9
+    code_path: code/run_mlm_flax.py
+    framework: huggingface
+    huggingface_version: 4.16.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.10
+    start_time: 1642195735
+    t:
+      1:
+      - 2
+      - 3
+      - 11
+      - 12
+      2:
+      - 2
+      - 3
+      - 11
+      - 12
+      4: 3.8.10
+      5: 0.12.9
+      6: 4.16.0.dev0
+      8:
+      - 5
+adafactor:
+  desc: null
+  value: false
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.98
+adam_epsilon:
+  desc: null
+  value: 1.0e-06
+cache_dir:
+  desc: null
+  value: null
+config_name:
+  desc: null
+  value: roberta-base
+dataset_config_name:
+  desc: null
+  value: null
+dataset_name:
+  desc: null
+  value: NbAiLab/NCC
+do_eval:
+  desc: null
+  value: true
+do_train:
+  desc: null
+  value: true
+dtype:
+  desc: null
+  value: bfloat16
+eval_steps:
+  desc: null
+  value: 1000
+hub_model_id:
+  desc: null
+  value: null
+hub_token:
+  desc: null
+  value: null
+learning_rate:
+  desc: null
+  value: 0.0006
+line_by_line:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 1000
+max_seq_length:
+  desc: null
+  value: 128
+mlm_probability:
+  desc: null
+  value: 0.15
+model_name_or_path:
+  desc: null
+  value: null
+model_type:
+  desc: null
+  value: roberta
+num_train_epochs:
+  desc: null
+  value: 3.0
+output_dir:
+  desc: null
+  value: ./
+overwrite_cache:
+  desc: null
+  value: false
+overwrite_output_dir:
+  desc: null
+  value: true
+pad_to_max_length:
+  desc: null
+  value: true
+per_device_eval_batch_size:
+  desc: null
+  value: 250
+per_device_train_batch_size:
+  desc: null
+  value: 250
+preprocessing_num_workers:
+  desc: null
+  value: null
+push_to_hub:
+  desc: null
+  value: true
+save_steps:
+  desc: null
+  value: 1000
+seed:
+  desc: null
+  value: 42
+tokenizer_name:
+  desc: null
+  value: NbAiLab/nb-roberta-base
+train_file:
+  desc: null
+  value: null
+train_ref_file:
+  desc: null
+  value: null
+use_fast_tokenizer:
+  desc: null
+  value: true
+validation_file:
+  desc: null
+  value: null
+validation_ref_file:
+  desc: null
+  value: null
+validation_split_percentage:
+  desc: null
+  value: 5
+warmup_steps:
+  desc: null
+  value: 10000
+weight_decay:
+  desc: null
+  value: 0.01

wandb/run-20220114_212855-32qdb4k5/files/diff.patch ADDED Viewed

File without changes

wandb/run-20220114_212855-32qdb4k5/files/output.log ADDED Viewed

	@@ -0,0 +1,43 @@

+2022-01-14 21:29:01.798913: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
+2022-01-14 21:29:01.798960: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
+Epoch ... (1/3):   0%|                                                                                                                                                                                                                                              | 0/3 [00:00<?, ?it/s]
+Training...:   0%|                                                                                                                                                                                                                                              | 0/39919 [02:17<?, ?it/s]
+Epoch ... (1/3):   0%|                                                                                                                                                                                                                                              | 0/3 [03:05<?, ?it/s]
+Traceback (most recent call last):
+  File "run_mlm_flax.py", line 815, in <module>
+    main()
+  File "run_mlm_flax.py", line 723, in main
+    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 162, in reraise_with_filtered_traceback
+    return fun(*args, **kwargs)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/api.py", line 2058, in cache_miss
+    out_tree, out_flat = f_pmapped_(*args, **kwargs)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/api.py", line 1934, in f_pmapped
+    out = pxla.xla_pmap(
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 1727, in bind
+    return call_bind(self, fun, *args, **params)
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 1652, in call_bind
+    outs = primitive.process(top_trace, fun, tracers, params)
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 1730, in process
+    return trace.process_map(self, fun, tracers, params)
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 633, in process_call
+    return primitive.impl(f, *tracers, **params)
+  File "/data/flax/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 778, in xla_pmap_impl
+    return compiled_fun(*args)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/profiler.py", line 206, in wrapper
+    return func(*args, **kwargs)
+  File "/data/flax/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 1502, in execute_replicated
+    out_bufs = compiled.execute_sharded_on_local_devices(input_bufs)
+jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: RESOURCE_EXHAUSTED: Attempting to reserve 12.83G at the bottom of memory. That was not possible. There are 13.18G free, 0B reserved, and 12.71G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).
+The stack trace below excludes JAX-internal frames.
+The preceding is the original exception that occurred, unmodified.
+--------------------
+The above exception was the direct cause of the following exception:
+Traceback (most recent call last):
+  File "run_mlm_flax.py", line 815, in <module>
+    main()
+  File "run_mlm_flax.py", line 723, in main
+    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+  File "/data/flax/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 1502, in execute_replicated
+    out_bufs = compiled.execute_sharded_on_local_devices(input_bufs)
+RuntimeError: RESOURCE_EXHAUSTED: Attempting to reserve 12.83G at the bottom of memory. That was not possible. There are 13.18G free, 0B reserved, and 12.71G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).

wandb/run-20220114_212855-32qdb4k5/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,122 @@

+absl-py==1.0.0
+aiohttp==3.8.1
+aiosignal==1.2.0
+astunparse==1.6.3
+async-timeout==4.0.2
+attrs==21.4.0
+backcall==0.2.0
+cachetools==4.2.4
+certifi==2021.10.8
+charset-normalizer==2.0.10
+chex==0.1.0
+click==8.0.3
+clu==0.0.6
+configparser==5.2.0
+contextlib2==21.6.0
+cycler==0.11.0
+datasets==1.17.1.dev0
+decorator==5.1.0
+dill==0.3.4
+dm-tree==0.1.6
+docker-pycreds==0.4.0
+filelock==3.4.2
+flatbuffers==2.0
+flax==0.3.6
+fonttools==4.28.5
+frozenlist==1.2.0
+fsspec==2021.11.1
+future==0.18.2
+gast==0.4.0
+gitdb==4.0.9
+gitpython==3.1.26
+google-auth-oauthlib==0.4.6
+google-auth==2.3.3
+google-pasta==0.2.0
+googleapis-common-protos==1.54.0
+grpcio==1.43.0
+h5py==3.6.0
+huggingface-hub==0.2.1
+idna==3.3
+importlib-metadata==4.10.0
+importlib-resources==5.4.0
+ipython==7.31.0
+jax==0.2.26
+jaxlib==0.1.75
+jedi==0.18.1
+joblib==1.1.0
+keras-preprocessing==1.1.2
+keras==2.7.0
+kiwisolver==1.3.2
+libclang==12.0.0
+libtpu-nightly==0.1.dev20211208
+markdown==3.3.6
+matplotlib-inline==0.1.3
+matplotlib==3.5.1
+ml-collections==0.1.0
+msgpack==1.0.3
+multidict==5.2.0
+multiprocess==0.70.12.2
+numpy==1.22.0
+oauthlib==3.1.1
+opt-einsum==3.3.0
+optax==0.1.0
+packaging==21.3
+pandas==1.3.5
+parso==0.8.3
+pathtools==0.1.2
+pexpect==4.8.0
+pickleshare==0.7.5
+pillow==9.0.0
+pip==20.0.2
+pkg-resources==0.0.0
+promise==2.3
+prompt-toolkit==3.0.24
+protobuf==3.19.1
+psutil==5.9.0
+ptyprocess==0.7.0
+pyarrow==6.0.1
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pygments==2.11.1
+pyparsing==3.0.6
+python-dateutil==2.8.2
+pytz==2021.3
+pyyaml==6.0
+regex==2021.11.10
+requests-oauthlib==1.3.0
+requests==2.27.0
+rsa==4.8
+sacremoses==0.0.46
+scipy==1.7.3
+sentry-sdk==1.5.2
+setuptools==44.0.0
+shortuuid==1.0.8
+six==1.16.0
+smmap==5.0.0
+subprocess32==3.5.4
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.0
+tensorboard==2.7.0
+tensorflow-cpu==2.7.0
+tensorflow-datasets==4.4.0
+tensorflow-estimator==2.7.0
+tensorflow-io-gcs-filesystem==0.23.1
+tensorflow-metadata==1.5.0
+tensorflow==2.7.0
+termcolor==1.1.0
+tokenizers==0.11.2
+toolz==0.11.2
+tqdm==4.62.3
+traitlets==5.1.1
+transformers==4.16.0.dev0
+typing-extensions==3.10.0.2
+urllib3==1.26.7
+wandb==0.12.9
+wcwidth==0.2.5
+werkzeug==2.0.2
+wheel==0.37.1
+wrapt==1.13.3
+xxhash==2.0.2
+yarl==1.7.2
+yaspin==2.1.0
+zipp==3.7.0

wandb/run-20220114_212855-32qdb4k5/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+    "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+    "python": "3.8.10",
+    "heartbeatAt": "2022-01-14T21:28:58.974844",
+    "startedAt": "2022-01-14T21:28:55.397355",
+    "docker": null,
+    "cpu_count": 96,
+    "cuda": null,
+    "args": [
+        "--output_dir=./",
+        "--model_type=roberta",
+        "--config_name=roberta-base",
+        "--tokenizer_name=NbAiLab/nb-roberta-base",
+        "--dataset_name=NbAiLab/NCC",
+        "--max_seq_length=128",
+        "--weight_decay=0.01",
+        "--per_device_train_batch_size=250",
+        "--per_device_eval_batch_size=250",
+        "--pad_to_max_length",
+        "--learning_rate=6e-4",
+        "--warmup_steps=10000",
+        "--overwrite_output_dir",
+        "--num_train_epochs=3",
+        "--adam_beta1=0.9",
+        "--adam_beta2=0.98",
+        "--adam_epsilon=1e-6",
+        "--logging_steps=1000",
+        "--save_steps=1000",
+        "--eval_steps=1000",
+        "--do_train",
+        "--do_eval",
+        "--dtype=bfloat16",
+        "--push_to_hub"
+    ],
+    "state": "running",
+    "program": "run_mlm_flax.py",
+    "codePath": "run_mlm_flax.py",
+    "git": {
+        "remote": "https://huggingface.co/versae/roberta-base-ncc",
+        "commit": "502df078f73cf93ca9380fcac1c9b9c7598a445f"
+    },
+    "email": "versae@gmail.com",
+    "root": "/data/roberta-base-ncc",
+    "host": "t1v-n-eedfb410-w-0",
+    "username": "javierr",
+    "executable": "/data/flax/bin/python"
+}

wandb/run-20220114_212855-32qdb4k5/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 200}}

wandb/run-20220114_212855-32qdb4k5/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,189 @@

+2022-01-14 21:28:56,265 INFO    MainThread:8253 [internal.py:wandb_internal():87] W&B internal server running at pid: 8253, started at: 2022-01-14 21:28:56.265129
+2022-01-14 21:28:56,268 DEBUG   SenderThread:8253 [sender.py:send():234] send: header
+2022-01-14 21:28:56,268 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: check_version
+2022-01-14 21:28:56,268 INFO    WriterThread:8253 [datastore.py:open_for_write():77] open: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/run-32qdb4k5.wandb
+2022-01-14 21:28:56,268 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: check_version
+2022-01-14 21:28:56,352 DEBUG   SenderThread:8253 [sender.py:send():234] send: run
+2022-01-14 21:28:56,515 INFO    SenderThread:8253 [dir_watcher.py:__init__():169] watching files in: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files
+2022-01-14 21:28:56,515 INFO    SenderThread:8253 [sender.py:_start_run_threads():804] run started: 32qdb4k5 with start time 1642195735
+2022-01-14 21:28:56,515 DEBUG   SenderThread:8253 [sender.py:send():234] send: summary
+2022-01-14 21:28:56,515 INFO    SenderThread:8253 [sender.py:_save_file():939] saving file wandb-summary.json with policy end
+2022-01-14 21:28:56,515 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: run_start
+2022-01-14 21:28:57,561 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/wandb-summary.json
+2022-01-14 21:28:58,974 DEBUG   HandlerThread:8253 [meta.py:__init__():40] meta init
+2022-01-14 21:28:58,974 DEBUG   HandlerThread:8253 [meta.py:__init__():54] meta init done
+2022-01-14 21:28:58,974 DEBUG   HandlerThread:8253 [meta.py:probe():214] probe
+2022-01-14 21:28:58,975 DEBUG   HandlerThread:8253 [meta.py:_setup_git():204] setup git
+2022-01-14 21:28:59,006 DEBUG   HandlerThread:8253 [meta.py:_setup_git():211] setup git done
+2022-01-14 21:28:59,006 DEBUG   HandlerThread:8253 [meta.py:_save_code():92] save code
+2022-01-14 21:28:59,018 DEBUG   HandlerThread:8253 [meta.py:_save_code():113] save code done
+2022-01-14 21:28:59,018 DEBUG   HandlerThread:8253 [meta.py:_save_patches():130] save patches
+2022-01-14 21:28:59,561 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/code/run_mlm_flax.py
+2022-01-14 21:28:59,562 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/code
+2022-01-14 21:29:01,562 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:29:03,563 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:29:03,563 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/diff.patch
+2022-01-14 21:29:08,274 ERROR   HandlerThread:8253 [meta.py:_save_patches():171] Error generating diff: Command '['git', 'diff', '--submodule=diff', 'HEAD']' timed out after 5 seconds
+2022-01-14 21:29:08,274 DEBUG   HandlerThread:8253 [meta.py:_save_patches():172] save patches done
+2022-01-14 21:29:08,274 DEBUG   HandlerThread:8253 [meta.py:_save_pip():58] save pip
+2022-01-14 21:29:08,275 DEBUG   HandlerThread:8253 [meta.py:_save_pip():72] save pip done
+2022-01-14 21:29:08,275 DEBUG   HandlerThread:8253 [meta.py:probe():252] probe done
+2022-01-14 21:29:08,283 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:29:08,284 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:29:08,566 DEBUG   SenderThread:8253 [sender.py:send():234] send: config
+2022-01-14 21:29:08,566 DEBUG   SenderThread:8253 [sender.py:send():234] send: config
+2022-01-14 21:29:08,566 DEBUG   SenderThread:8253 [sender.py:send():234] send: config
+2022-01-14 21:29:08,567 DEBUG   SenderThread:8253 [sender.py:send():234] send: files
+2022-01-14 21:29:08,567 INFO    SenderThread:8253 [sender.py:_save_file():939] saving file wandb-metadata.json with policy now
+2022-01-14 21:29:08,567 INFO    SenderThread:8253 [sender.py:_save_file():939] saving file code/run_mlm_flax.py with policy now
+2022-01-14 21:29:08,571 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/requirements.txt
+2022-01-14 21:29:08,571 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/wandb-metadata.json
+2022-01-14 21:29:09,032 INFO    Thread-12 :8253 [upload_job.py:push():137] Uploaded file /tmp/tmpdg54qv_0wandb/w1tibuxq-code/run_mlm_flax.py
+2022-01-14 21:29:09,069 INFO    Thread-11 :8253 [upload_job.py:push():137] Uploaded file /tmp/tmpdg54qv_0wandb/35h4ryp5-wandb-metadata.json
+2022-01-14 21:29:09,571 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:29:21,520 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:29:21,521 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:29:27,049 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:29:27,579 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/config.yaml
+2022-01-14 21:29:36,656 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:29:36,657 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:29:51,794 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:29:51,795 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:29:57,119 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:29:58,591 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:30:06,945 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:30:06,945 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:30:22,126 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:30:22,127 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:30:27,189 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:30:37,330 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:30:37,330 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:30:52,532 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:30:52,532 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:30:57,257 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:31:07,691 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:31:07,692 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:31:22,944 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:31:22,945 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:31:27,323 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:31:38,085 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:31:38,086 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:31:53,231 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:31:53,231 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:31:57,395 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:32:08,366 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 21:32:08,367 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 21:32:16,649 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:32:16,893 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:16,893 DEBUG   SenderThread:8253 [sender.py:send():234] send: telemetry
+2022-01-14 21:32:16,893 DEBUG   SenderThread:8253 [sender.py:send():234] send: exit
+2022-01-14 21:32:16,893 INFO    SenderThread:8253 [sender.py:send_exit():366] handling exit code: 1
+2022-01-14 21:32:16,894 INFO    SenderThread:8253 [sender.py:send_exit():368] handling runtime: 200
+2022-01-14 21:32:16,894 INFO    SenderThread:8253 [sender.py:_save_file():939] saving file wandb-summary.json with policy end
+2022-01-14 21:32:16,894 INFO    SenderThread:8253 [sender.py:send_exit():374] send defer
+2022-01-14 21:32:16,894 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:16,895 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:16,895 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 0
+2022-01-14 21:32:16,895 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:16,895 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 0
+2022-01-14 21:32:16,895 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 1
+2022-01-14 21:32:16,896 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:16,896 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 1
+2022-01-14 21:32:16,941 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:16,941 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 1
+2022-01-14 21:32:16,941 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 2
+2022-01-14 21:32:16,941 DEBUG   SenderThread:8253 [sender.py:send():234] send: stats
+2022-01-14 21:32:16,942 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:16,942 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 2
+2022-01-14 21:32:16,942 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:16,942 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 2
+2022-01-14 21:32:16,942 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 3
+2022-01-14 21:32:16,942 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:16,942 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 3
+2022-01-14 21:32:16,943 DEBUG   SenderThread:8253 [sender.py:send():234] send: summary
+2022-01-14 21:32:16,943 INFO    SenderThread:8253 [sender.py:_save_file():939] saving file wandb-summary.json with policy end
+2022-01-14 21:32:16,943 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:16,943 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 3
+2022-01-14 21:32:16,943 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 4
+2022-01-14 21:32:16,943 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:16,943 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 4
+2022-01-14 21:32:16,944 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:16,944 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 4
+2022-01-14 21:32:16,997 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:17,650 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:32:17,650 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/wandb-summary.json
+2022-01-14 21:32:17,685 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 5
+2022-01-14 21:32:17,686 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:17,686 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:17,686 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 5
+2022-01-14 21:32:17,686 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:17,686 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 5
+2022-01-14 21:32:17,687 INFO    SenderThread:8253 [dir_watcher.py:finish():283] shutting down directory watcher
+2022-01-14 21:32:17,787 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:18,650 INFO    Thread-8  :8253 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/config.yaml
+2022-01-14 21:32:18,651 INFO    SenderThread:8253 [dir_watcher.py:finish():313] scan: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files
+2022-01-14 21:32:18,651 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/config.yaml config.yaml
+2022-01-14 21:32:18,651 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/diff.patch diff.patch
+2022-01-14 21:32:18,651 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/requirements.txt requirements.txt
+2022-01-14 21:32:18,652 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log output.log
+2022-01-14 21:32:18,652 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/wandb-summary.json wandb-summary.json
+2022-01-14 21:32:18,652 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/wandb-metadata.json wandb-metadata.json
+2022-01-14 21:32:18,656 INFO    SenderThread:8253 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/code/run_mlm_flax.py code/run_mlm_flax.py
+2022-01-14 21:32:18,656 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 6
+2022-01-14 21:32:18,656 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:18,657 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:18,657 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 6
+2022-01-14 21:32:18,662 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:18,665 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 6
+2022-01-14 21:32:18,665 INFO    SenderThread:8253 [file_pusher.py:finish():177] shutting down file pusher
+2022-01-14 21:32:18,757 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:18,758 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:18,859 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:18,860 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:18,961 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:18,962 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,063 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:19,064 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,139 INFO    Thread-15 :8253 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/output.log
+2022-01-14 21:32:19,148 INFO    Thread-14 :8253 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/requirements.txt
+2022-01-14 21:32:19,165 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:19,165 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,171 INFO    Thread-13 :8253 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/config.yaml
+2022-01-14 21:32:19,267 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:19,267 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,288 INFO    Thread-16 :8253 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/files/wandb-summary.json
+2022-01-14 21:32:19,370 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:19,370 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,472 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:19,472 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,489 INFO    Thread-7  :8253 [sender.py:transition_state():387] send defer: 7
+2022-01-14 21:32:19,489 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:19,490 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 7
+2022-01-14 21:32:19,490 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:19,490 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 7
+2022-01-14 21:32:19,573 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:19,915 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 8
+2022-01-14 21:32:19,916 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:19,916 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:19,916 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 8
+2022-01-14 21:32:19,917 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:19,917 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 8
+2022-01-14 21:32:19,917 INFO    SenderThread:8253 [sender.py:transition_state():387] send defer: 9
+2022-01-14 21:32:19,917 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 21:32:19,917 INFO    HandlerThread:8253 [handler.py:handle_request_defer():147] handle defer: 9
+2022-01-14 21:32:19,918 DEBUG   SenderThread:8253 [sender.py:send():234] send: final
+2022-01-14 21:32:19,918 DEBUG   SenderThread:8253 [sender.py:send():234] send: footer
+2022-01-14 21:32:19,918 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: defer
+2022-01-14 21:32:19,918 INFO    SenderThread:8253 [sender.py:send_request_defer():383] handle sender defer: 9
+2022-01-14 21:32:20,017 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 21:32:20,018 DEBUG   SenderThread:8253 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 21:32:20,018 INFO    SenderThread:8253 [file_pusher.py:join():182] waiting for file pusher
+2022-01-14 21:32:20,278 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: get_summary
+2022-01-14 21:32:20,278 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: sampled_history
+2022-01-14 21:32:20,279 DEBUG   HandlerThread:8253 [handler.py:handle_request():130] handle_request: shutdown
+2022-01-14 21:32:20,279 INFO    HandlerThread:8253 [handler.py:finish():731] shutting down handler
+2022-01-14 21:32:20,918 INFO    WriterThread:8253 [datastore.py:close():281] close: /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/run-32qdb4k5.wandb
+2022-01-14 21:32:21,277 INFO    SenderThread:8253 [sender.py:finish():1070] shutting down sender
+2022-01-14 21:32:21,277 INFO    SenderThread:8253 [file_pusher.py:finish():177] shutting down file pusher
+2022-01-14 21:32:21,277 INFO    SenderThread:8253 [file_pusher.py:join():182] waiting for file pusher
+2022-01-14 21:32:21,279 INFO    MainThread:8253 [internal.py:handle_exit():77] Internal process exited

wandb/run-20220114_212855-32qdb4k5/logs/debug.log ADDED Viewed

	@@ -0,0 +1,150 @@

+2022-01-14 21:28:55,408 INFO    MainThread:5000 [wandb_setup.py:_flush():71] setting env: {}
+2022-01-14 21:28:55,408 INFO    MainThread:5000 [wandb_setup.py:_flush():71] setting login settings: {}
+2022-01-14 21:28:55,408 INFO    MainThread:5000 [wandb_init.py:_log_setup():371] Logging user logs to /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/logs/debug.log
+2022-01-14 21:28:55,408 INFO    MainThread:5000 [wandb_init.py:_log_setup():372] Logging internal logs to /data/roberta-base-ncc/wandb/run-20220114_212855-32qdb4k5/logs/debug-internal.log
+2022-01-14 21:28:55,408 INFO    MainThread:5000 [wandb_init.py:init():404] calling init triggers
+2022-01-14 21:28:55,408 INFO    MainThread:5000 [wandb_init.py:init():409] wandb.init called with sweep_config: {}
+config: {}
+2022-01-14 21:28:55,409 INFO    MainThread:5000 [wandb_init.py:init():460] starting backend
+2022-01-14 21:28:55,409 INFO    MainThread:5000 [backend.py:_multiprocessing_setup():99] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2022-01-14 21:28:55,461 INFO    MainThread:5000 [backend.py:ensure_launched():216] starting backend process...
+2022-01-14 21:28:55,488 INFO    MainThread:5000 [backend.py:ensure_launched():221] started backend process with pid: 8253
+2022-01-14 21:28:55,490 INFO    MainThread:5000 [wandb_init.py:init():469] backend started and connected
+2022-01-14 21:28:55,501 INFO    MainThread:5000 [wandb_init.py:init():533] updated telemetry
+2022-01-14 21:28:55,563 INFO    MainThread:5000 [wandb_init.py:init():563] communicating current version
+2022-01-14 21:28:56,351 INFO    MainThread:5000 [wandb_init.py:init():568] got version response
+2022-01-14 21:28:56,351 INFO    MainThread:5000 [wandb_init.py:init():578] communicating run to backend with 30 second timeout
+2022-01-14 21:28:56,515 INFO    MainThread:5000 [wandb_init.py:init():606] starting run threads in backend
+2022-01-14 21:29:01,520 INFO    MainThread:5000 [wandb_run.py:_console_start():1810] atexit reg
+2022-01-14 21:29:01,520 INFO    MainThread:5000 [wandb_run.py:_redirect():1684] redirect: SettingsConsole.REDIRECT
+2022-01-14 21:29:01,520 INFO    MainThread:5000 [wandb_run.py:_redirect():1689] Redirecting console.
+2022-01-14 21:29:01,523 INFO    MainThread:5000 [wandb_run.py:_redirect():1745] Redirects installed.
+2022-01-14 21:29:01,523 INFO    MainThread:5000 [wandb_init.py:init():633] run started, returning control to user process
+2022-01-14 21:29:01,523 INFO    MainThread:5000 [wandb_run.py:_config_callback():956] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'per_device_train_batch_size': 250, 'per_device_eval_batch_size': 250, 'learning_rate': 0.0006, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-06, 'adafactor': False, 'num_train_epochs': 3.0, 'warmup_steps': 10000, 'logging_steps': 1000, 'save_steps': 1000, 'eval_steps': 1000, 'seed': 42, 'push_to_hub': True, 'hub_model_id': None, 'hub_token': None}
+2022-01-14 21:29:01,524 INFO    MainThread:5000 [wandb_run.py:_config_callback():956] config_cb None None {'model_name_or_path': None, 'model_type': 'roberta', 'config_name': 'roberta-base', 'tokenizer_name': 'NbAiLab/nb-roberta-base', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
+2022-01-14 21:29:01,524 INFO    MainThread:5000 [wandb_run.py:_config_callback():956] config_cb None None {'dataset_name': 'NbAiLab/NCC', 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 128, 'preprocessing_num_workers': None, 'mlm_probability': 0.15, 'pad_to_max_length': True, 'line_by_line': False}
+2022-01-14 21:32:14,189 INFO    MainThread:5000 [wandb_run.py:_atexit_cleanup():1780] got exitcode: 1
+2022-01-14 21:32:14,192 INFO    MainThread:5000 [wandb_run.py:_restore():1752] restore
+2022-01-14 21:32:16,895 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 1
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 37446
+}
+2022-01-14 21:32:17,686 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 1
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 37446
+}
+2022-01-14 21:32:18,657 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 45931
+}
+2022-01-14 21:32:18,759 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 45931
+}
+2022-01-14 21:32:18,861 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45903
+  total_bytes: 45931
+}
+2022-01-14 21:32:18,962 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45903
+  total_bytes: 45931
+}
+2022-01-14 21:32:19,064 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+2022-01-14 21:32:19,166 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+2022-01-14 21:32:19,269 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+2022-01-14 21:32:19,371 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+2022-01-14 21:32:19,473 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+2022-01-14 21:32:19,917 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+2022-01-14 21:32:20,277 INFO    MainThread:5000 [wandb_run.py:_wait_for_finish():1912] got exit ret: done: true
+exit_result {
+}
+file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45931
+  total_bytes: 45931
+}
+local_info {
+}
+2022-01-14 21:32:23,445 INFO    MainThread:5000 [wandb_run.py:_append_files():2180] logging synced files

wandb/run-20220114_212855-32qdb4k5/run-32qdb4k5.wandb ADDED Viewed

Binary file (7.7 kB). View file

wandb/run-20220114_221533-24dma583/files/code/run_mlm_flax.py ADDED Viewed

	@@ -0,0 +1,815 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
+text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=fill-mask
+"""
+import json
+import logging
+import math
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from itertools import chain
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    FlaxAutoModelForMaskedLM,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    TensorType,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.file_utils import get_full_repo_name
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+@dataclass
+class TrainingArguments:
+    """
+    Arguments describing a training/evaluation run: where outputs go, which phases to run, batch sizes,
+    optimizer hyper-parameters, logging/saving/evaluation cadence, seeding and Hub upload options.
+
+    The meaning of each field is given by its `metadata["help"]` string.
+    """
+    # The only field without a default, so it must always be supplied.
+    output_dir: str = field(
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Overwrite the content of the output directory. "
+                "Use this to continue training if output_dir points to a checkpoint directory."
+            )
+        },
+    )
+    # Phase switches.
+    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
+    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+    # Batch sizes are expressed per device, not globally.
+    per_device_train_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
+    )
+    per_device_eval_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
+    )
+    # Optimizer hyper-parameters.
+    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+    adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
+    # Schedule and cadence, all counted in update steps except `num_train_epochs`.
+    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
+    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+    # NOTE(review): annotated `int` but defaults to None, i.e. effectively Optional[int].
+    eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+    seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
+    # Hub upload options.
+    push_to_hub: bool = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    # NOTE(review): annotated `str` but defaults to None, i.e. effectively Optional[str].
+    hub_model_id: str = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    # NOTE(review): annotated `str` but defaults to None, i.e. effectively Optional[str].
+    hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
+    def __post_init__(self):
+        # Expand a leading `~` so later path handling sees the user's real home directory.
+        if self.output_dir is not None:
+            self.output_dir = os.path.expanduser(self.output_dir)
+    def to_dict(self):
+        """
+        Serialize this instance to a plain dict suitable for JSON serialization.
+
+        `Enum` members (and non-empty lists whose first element is an `Enum`) are replaced by their `.value`.
+        Every field whose name ends in `_token` is replaced by a `<FIELD_NAME>` placeholder, whatever its
+        current value (including `None`), so secrets never appear in the output.
+
+        Returns:
+            dict: field name -> serializable value.
+        """
+        # `asdict` builds a fresh dict, so the reassignments below do not touch the instance itself.
+        d = asdict(self)
+        # Only values of existing keys are reassigned, so mutating `d` while iterating it is safe.
+        for k, v in d.items():
+            if isinstance(v, Enum):
+                d[k] = v.value
+            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
+                d[k] = [x.value for x in v]
+            # Checked last and independently of the branches above, so the placeholder always wins.
+            if k.endswith("_token"):
+                d[k] = f"<{k.upper()}>"
+        return d
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+    # Checkpoint to start from; `main()` builds a fresh model from the config when this is left unset.
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            # The two literals are concatenated, so the first one must end with a space.
+            "help": "The model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    # Only consulted by `main()` when neither `config_name` nor `model_name_or_path` is given.
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    # Name of a `jax.numpy` dtype; `main()` resolves it with `getattr(jnp, dtype)`.
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Options describing the data used for training and evaluation: where it comes from (a hub dataset or local
+    files) and how it is tokenized, split and masked.
+    """
+    dataset_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."},
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
+    )
+    train_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "The input training data file (a text file)."},
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False,
+        metadata={"help": "Overwrite the cached training and evaluation sets"},
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15,
+        metadata={"help": "Ratio of tokens to mask for masked language modeling loss"},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    def __post_init__(self):
+        # At least one data source (hub dataset or local file) has to be given.
+        sources = (self.dataset_name, self.train_file, self.validation_file)
+        if all(source is None for source in sources):
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        # Local files are recognised purely by the text after their last dot.
+        for arg_name in ("train_file", "validation_file"):
+            path = getattr(self, arg_name)
+            if path is None:
+                continue
+            extension = path.split(".")[-1]
+            assert extension in ["csv", "json", "txt"], f"`{arg_name}` should be a csv, a json or a txt file."
+@flax.struct.dataclass
+class FlaxDataCollatorForLanguageModeling:
+    """
+    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
+    are not all of the same length.
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input.
+    .. note::
+        For best performance, this data collator should be used with a dataset having items that are dictionaries or
+        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
+        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
+        argument :obj:`return_special_tokens_mask=True`. When the key is missing, the mask is recomputed from the
+        padded ``input_ids`` with the tokenizer for every batch.
+    """
+    tokenizer: PreTrainedTokenizerBase
+    mlm_probability: float = 0.15
+    def __post_init__(self):
+        # Masked language modeling replaces tokens by the mask token, so the tokenizer must define one.
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                "You should pass `mlm=False` to train on causal language modeling instead."
+            )
+    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
+        """
+        Pad `examples` into one NumPy batch and apply random masking.
+
+        Returns the padded batch with `input_ids` replaced by their masked version and a new `labels` entry; the
+        `special_tokens_mask` entry, if present, is removed from the batch.
+        """
+        # Handle dict or lists with proper padding and conversion to tensor.
+        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        batch["input_ids"], batch["labels"] = self.mask_tokens(
+            batch["input_ids"], special_tokens_mask=special_tokens_mask
+        )
+        return batch
+    def mask_tokens(
+        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray] = None
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+
+        `inputs` is modified in place and also returned. `labels` holds the original token ids at the positions
+        selected for prediction and -100 everywhere else. Positions flagged in `special_tokens_mask` are never
+        selected; when the mask is `None` it is derived from `inputs` through the tokenizer.
+        """
+        labels = inputs.copy()
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = np.full(labels.shape, self.mlm_probability)
+        if special_tokens_mask is None:
+            # The dataset was tokenized without `return_special_tokens_mask=True`: rebuild the mask row by row
+            # instead of failing on `None.astype`.
+            special_tokens_mask = np.array(
+                [
+                    self.tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
+                    for row in labels.tolist()
+                ]
+            )
+        special_tokens_mask = special_tokens_mask.astype("bool")
+        probability_matrix[special_tokens_mask] = 0.0
+        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+        # 10% of the time, we replace masked input tokens with random word
+        # (half of the selected positions that were not replaced by the mask token above)
+        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
+        indices_random &= masked_indices & ~indices_replaced
+        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
+        inputs[indices_random] = random_words[indices_random]
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    """
+    Split `samples_idx` into consecutive batches of exactly `batch_size` indices.
+
+    Trailing indices that do not fill a whole batch are dropped. The result is the list of arrays produced by
+    `np.split`; it is an empty list when there are fewer than `batch_size` indices.
+    """
+    num_samples = len(samples_idx)
+    num_batches = num_samples // batch_size
+    if num_batches == 0:
+        # `np.split` cannot split into zero sections, so an input smaller than one batch yields no batches.
+        return []
+    # Keep only the prefix that divides evenly into full batches.
+    samples_idx = samples_idx[: num_batches * batch_size]
+    return np.split(samples_idx, num_batches)
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    """
+    Log `train_time` at `step`, then log every collected training metric as the scalar `train_<key>`.
+
+    The values of one metric are written to consecutive steps ending at `step`, i.e. the last value lands on
+    `step` itself.
+    """
+    summary_writer.scalar("train_time", train_time, step)
+    # NOTE(review): `get_metrics` is defined outside this block; presumably it turns the list of per-step metric
+    # dicts into one dict of per-key sequences -- confirm against its definition.
+    collected = get_metrics(train_metrics)
+    for key in collected:
+        values = collected[key]
+        first_step = step - len(values) + 1
+        for offset, value in enumerate(values):
+            summary_writer.scalar(f"train_{key}", value, first_step + offset)
+def write_eval_metric(summary_writer, eval_metrics, step):
+    """Log every entry of `eval_metrics` as the scalar `eval_<name>` at `step`."""
+    for metric_name in eval_metrics:
+        summary_writer.scalar(f"eval_{metric_name}", eval_metrics[metric_name], step)
+def main():
+    """
+    Entry point: parse the model/data/training arguments, load and tokenize the dataset, then train a Flax
+    masked-language model with periodic logging, evaluation and checkpointing (every `logging_steps`,
+    `eval_steps` and `save_steps` respectively). When `do_eval` is set, a final evaluation is run after training
+    and written to `eval_results.json` in `output_dir`.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+            "Use --overwrite_output_dir to overcome."
+        )
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        level=logging.INFO,
+        datefmt="[%X]",
+    )
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        # `DataTrainingArguments` accepts a validation file without a train file, so take the extension from
+        # whichever file was actually provided.
+        reference_file = data_args.train_file if data_args.train_file is not None else data_args.validation_file
+        extension = reference_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model and tokenizer
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    # `max_seq_length` defaults to None, which means "use the tokenizer's maximum"; `min(None, ...)` would raise.
+    if data_args.max_seq_length is None:
+        max_seq_length = tokenizer.model_max_length
+    else:
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+            return tokenizer(
+                examples,
+                return_special_tokens_mask=True,
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+            )
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            input_columns=[text_column_name],
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            # Enable Weight&Biases
+            import wandb
+            wandb.init(
+                entity='versae',
+                project='roberta-base-ncc',
+                sync_tensorboard=False,
+            )
+            wandb.config.update(training_args)
+            wandb.config.update(model_args)
+            wandb.config.update(data_args)
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    elif not has_tensorboard:
+        # Only warn when the package is really missing, not merely because this is not the master process.
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+    # create adam optimizer
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        def loss_fn(params):
+            labels = batch.pop("labels")
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            # compute loss, ignore padded input tokens
+            label_mask = jnp.where(labels > 0, 1.0, 0.0)
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+            # take average
+            loss = loss.sum() / label_mask.sum()
+            return loss
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+        return new_state, metrics, new_dropout_rng
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        # compute loss, ignore padded input tokens
+        label_mask = jnp.where(labels > 0, 1.0, 0.0)
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
+        # summarize metrics
+        metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
+        metrics = jax.lax.psum(metrics, axis_name="batch")
+        return metrics
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                # Add only the time elapsed since the previous measurement, then restart the timer; otherwise the
+                # time since the start of the epoch would be added again at every logging step.
+                now = time.time()
+                train_time += now - train_start
+                train_start = now
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+                train_metrics = []
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save a checkpoint every `save_steps` steps and push it to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+    # Eval after training
+    if training_args.do_eval:
+        num_eval_samples = len(tokenized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+        eval_metrics = []
+        for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+        # normalize eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
+        eval_normalizer = eval_metrics.pop("normalizer")
+        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+        try:
+            perplexity = math.exp(eval_metrics["loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        eval_metrics["perplexity"] = perplexity
+        if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+# Run the training pipeline only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

wandb/run-20220114_221533-24dma583/files/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.12.9
+    code_path: code/run_mlm_flax.py
+    framework: huggingface
+    huggingface_version: 4.16.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.10
+    start_time: 1642198533
+    t:
+      1:
+      - 2
+      - 3
+      - 11
+      - 12
+      2:
+      - 2
+      - 3
+      - 11
+      - 12
+      4: 3.8.10
+      5: 0.12.9
+      6: 4.16.0.dev0
+      8:
+      - 5
+adafactor:
+  desc: null
+  value: false
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.98
+adam_epsilon:
+  desc: null
+  value: 1.0e-06
+cache_dir:
+  desc: null
+  value: null
+config_name:
+  desc: null
+  value: roberta-base
+dataset_config_name:
+  desc: null
+  value: null
+dataset_name:
+  desc: null
+  value: NbAiLab/NCC
+do_eval:
+  desc: null
+  value: true
+do_train:
+  desc: null
+  value: true
+dtype:
+  desc: null
+  value: bfloat16
+eval_steps:
+  desc: null
+  value: 1000
+hub_model_id:
+  desc: null
+  value: null
+hub_token:
+  desc: null
+  value: null
+learning_rate:
+  desc: null
+  value: 0.0006
+line_by_line:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 1000
+max_seq_length:
+  desc: null
+  value: 128
+mlm_probability:
+  desc: null
+  value: 0.15
+model_name_or_path:
+  desc: null
+  value: null
+model_type:
+  desc: null
+  value: roberta
+num_train_epochs:
+  desc: null
+  value: 3.0
+output_dir:
+  desc: null
+  value: ./
+overwrite_cache:
+  desc: null
+  value: false
+overwrite_output_dir:
+  desc: null
+  value: true
+pad_to_max_length:
+  desc: null
+  value: true
+per_device_eval_batch_size:
+  desc: null
+  value: 250
+per_device_train_batch_size:
+  desc: null
+  value: 250
+preprocessing_num_workers:
+  desc: null
+  value: null
+push_to_hub:
+  desc: null
+  value: true
+save_steps:
+  desc: null
+  value: 1000
+seed:
+  desc: null
+  value: 42
+tokenizer_name:
+  desc: null
+  value: NbAiLab/nb-roberta-base
+train_file:
+  desc: null
+  value: null
+train_ref_file:
+  desc: null
+  value: null
+use_fast_tokenizer:
+  desc: null
+  value: true
+validation_file:
+  desc: null
+  value: null
+validation_ref_file:
+  desc: null
+  value: null
+validation_split_percentage:
+  desc: null
+  value: 5
+warmup_steps:
+  desc: null
+  value: 10000
+weight_decay:
+  desc: null
+  value: 0.01

wandb/run-20220114_221533-24dma583/files/diff.patch ADDED Viewed

File without changes

wandb/run-20220114_221533-24dma583/files/output.log ADDED Viewed

	@@ -0,0 +1,43 @@

+2022-01-14 22:15:40.254500: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
+2022-01-14 22:15:40.254546: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
+Epoch ... (1/3):   0%|                                                                                                          | 0/3 [00:00<?, ?it/s]
+Training...:   0%|                                                                                                          | 0/39919 [02:25<?, ?it/s]
+Epoch ... (1/3):   0%|                                                                                                          | 0/3 [03:13<?, ?it/s]
+Traceback (most recent call last):
+  File "run_mlm_flax.py", line 815, in <module>
+    main()
+  File "run_mlm_flax.py", line 723, in main
+    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/traceback_util.py", line 162, in reraise_with_filtered_traceback
+    return fun(*args, **kwargs)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/api.py", line 2058, in cache_miss
+    out_tree, out_flat = f_pmapped_(*args, **kwargs)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/api.py", line 1934, in f_pmapped
+    out = pxla.xla_pmap(
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 1727, in bind
+    return call_bind(self, fun, *args, **params)
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 1652, in call_bind
+    outs = primitive.process(top_trace, fun, tracers, params)
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 1730, in process
+    return trace.process_map(self, fun, tracers, params)
+  File "/data/flax/lib/python3.8/site-packages/jax/core.py", line 633, in process_call
+    return primitive.impl(f, *tracers, **params)
+  File "/data/flax/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 778, in xla_pmap_impl
+    return compiled_fun(*args)
+  File "/data/flax/lib/python3.8/site-packages/jax/_src/profiler.py", line 206, in wrapper
+    return func(*args, **kwargs)
+  File "/data/flax/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 1502, in execute_replicated
+    out_bufs = compiled.execute_sharded_on_local_devices(input_bufs)
+jax._src.traceback_util.UnfilteredStackTrace: RuntimeError: RESOURCE_EXHAUSTED: Attempting to reserve 12.83G at the bottom of memory. That was not possible. There are 13.18G free, 0B reserved, and 12.71G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).
+The stack trace below excludes JAX-internal frames.
+The preceding is the original exception that occurred, unmodified.
+--------------------
+The above exception was the direct cause of the following exception:
+Traceback (most recent call last):
+  File "run_mlm_flax.py", line 815, in <module>
+    main()
+  File "run_mlm_flax.py", line 723, in main
+    state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+  File "/data/flax/lib/python3.8/site-packages/jax/interpreters/pxla.py", line 1502, in execute_replicated
+    out_bufs = compiled.execute_sharded_on_local_devices(input_bufs)
+RuntimeError: RESOURCE_EXHAUSTED: Attempting to reserve 12.83G at the bottom of memory. That was not possible. There are 13.18G free, 0B reserved, and 12.71G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).

wandb/run-20220114_221533-24dma583/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,122 @@

+absl-py==1.0.0
+aiohttp==3.8.1
+aiosignal==1.2.0
+astunparse==1.6.3
+async-timeout==4.0.2
+attrs==21.4.0
+backcall==0.2.0
+cachetools==4.2.4
+certifi==2021.10.8
+charset-normalizer==2.0.10
+chex==0.1.0
+click==8.0.3
+clu==0.0.6
+configparser==5.2.0
+contextlib2==21.6.0
+cycler==0.11.0
+datasets==1.17.1.dev0
+decorator==5.1.0
+dill==0.3.4
+dm-tree==0.1.6
+docker-pycreds==0.4.0
+filelock==3.4.2
+flatbuffers==2.0
+flax==0.3.6
+fonttools==4.28.5
+frozenlist==1.2.0
+fsspec==2021.11.1
+future==0.18.2
+gast==0.4.0
+gitdb==4.0.9
+gitpython==3.1.26
+google-auth-oauthlib==0.4.6
+google-auth==2.3.3
+google-pasta==0.2.0
+googleapis-common-protos==1.54.0
+grpcio==1.43.0
+h5py==3.6.0
+huggingface-hub==0.2.1
+idna==3.3
+importlib-metadata==4.10.0
+importlib-resources==5.4.0
+ipython==7.31.0
+jax==0.2.26
+jaxlib==0.1.75
+jedi==0.18.1
+joblib==1.1.0
+keras-preprocessing==1.1.2
+keras==2.7.0
+kiwisolver==1.3.2
+libclang==12.0.0
+libtpu-nightly==0.1.dev20211208
+markdown==3.3.6
+matplotlib-inline==0.1.3
+matplotlib==3.5.1
+ml-collections==0.1.0
+msgpack==1.0.3
+multidict==5.2.0
+multiprocess==0.70.12.2
+numpy==1.22.0
+oauthlib==3.1.1
+opt-einsum==3.3.0
+optax==0.1.0
+packaging==21.3
+pandas==1.3.5
+parso==0.8.3
+pathtools==0.1.2
+pexpect==4.8.0
+pickleshare==0.7.5
+pillow==9.0.0
+pip==20.0.2
+pkg-resources==0.0.0
+promise==2.3
+prompt-toolkit==3.0.24
+protobuf==3.19.1
+psutil==5.9.0
+ptyprocess==0.7.0
+pyarrow==6.0.1
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pygments==2.11.1
+pyparsing==3.0.6
+python-dateutil==2.8.2
+pytz==2021.3
+pyyaml==6.0
+regex==2021.11.10
+requests-oauthlib==1.3.0
+requests==2.27.0
+rsa==4.8
+sacremoses==0.0.46
+scipy==1.7.3
+sentry-sdk==1.5.2
+setuptools==44.0.0
+shortuuid==1.0.8
+six==1.16.0
+smmap==5.0.0
+subprocess32==3.5.4
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.0
+tensorboard==2.7.0
+tensorflow-cpu==2.7.0
+tensorflow-datasets==4.4.0
+tensorflow-estimator==2.7.0
+tensorflow-io-gcs-filesystem==0.23.1
+tensorflow-metadata==1.5.0
+tensorflow==2.7.0
+termcolor==1.1.0
+tokenizers==0.11.2
+toolz==0.11.2
+tqdm==4.62.3
+traitlets==5.1.1
+transformers==4.16.0.dev0
+typing-extensions==3.10.0.2
+urllib3==1.26.7
+wandb==0.12.9
+wcwidth==0.2.5
+werkzeug==2.0.2
+wheel==0.37.1
+wrapt==1.13.3
+xxhash==2.0.2
+yarl==1.7.2
+yaspin==2.1.0
+zipp==3.7.0

wandb/run-20220114_221533-24dma583/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+    "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+    "python": "3.8.10",
+    "heartbeatAt": "2022-01-14T22:15:37.284889",
+    "startedAt": "2022-01-14T22:15:33.798491",
+    "docker": null,
+    "cpu_count": 96,
+    "cuda": null,
+    "args": [
+        "--output_dir=./",
+        "--model_type=roberta",
+        "--config_name=roberta-base",
+        "--tokenizer_name=NbAiLab/nb-roberta-base",
+        "--dataset_name=NbAiLab/NCC",
+        "--max_seq_length=128",
+        "--weight_decay=0.01",
+        "--per_device_train_batch_size=250",
+        "--per_device_eval_batch_size=250",
+        "--pad_to_max_length",
+        "--learning_rate=6e-4",
+        "--warmup_steps=10000",
+        "--overwrite_output_dir",
+        "--num_train_epochs=3",
+        "--adam_beta1=0.9",
+        "--adam_beta2=0.98",
+        "--adam_epsilon=1e-6",
+        "--logging_steps=1000",
+        "--save_steps=1000",
+        "--eval_steps=1000",
+        "--do_train",
+        "--do_eval",
+        "--dtype=bfloat16",
+        "--push_to_hub"
+    ],
+    "state": "running",
+    "program": "run_mlm_flax.py",
+    "codePath": "run_mlm_flax.py",
+    "git": {
+        "remote": "https://huggingface.co/versae/roberta-base-ncc",
+        "commit": "502df078f73cf93ca9380fcac1c9b9c7598a445f"
+    },
+    "email": "versae@gmail.com",
+    "root": "/data/roberta-base-ncc",
+    "host": "t1v-n-eedfb410-w-0",
+    "username": "javierr",
+    "executable": "/data/flax/bin/python"
+}

wandb/run-20220114_221533-24dma583/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 208}}

wandb/run-20220114_221533-24dma583/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,187 @@

+2022-01-14 22:15:34,709 INFO    MainThread:7834 [internal.py:wandb_internal():87] W&B internal server running at pid: 7834, started at: 2022-01-14 22:15:34.709583
+2022-01-14 22:15:34,711 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: check_version
+2022-01-14 22:15:34,712 INFO    WriterThread:7834 [datastore.py:open_for_write():77] open: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/run-24dma583.wandb
+2022-01-14 22:15:34,712 DEBUG   SenderThread:7834 [sender.py:send():234] send: header
+2022-01-14 22:15:34,712 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: check_version
+2022-01-14 22:15:34,785 DEBUG   SenderThread:7834 [sender.py:send():234] send: run
+2022-01-14 22:15:34,980 INFO    SenderThread:7834 [dir_watcher.py:__init__():169] watching files in: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files
+2022-01-14 22:15:34,980 INFO    SenderThread:7834 [sender.py:_start_run_threads():804] run started: 24dma583 with start time 1642198533
+2022-01-14 22:15:34,980 DEBUG   SenderThread:7834 [sender.py:send():234] send: summary
+2022-01-14 22:15:34,980 INFO    SenderThread:7834 [sender.py:_save_file():939] saving file wandb-summary.json with policy end
+2022-01-14 22:15:34,981 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: run_start
+2022-01-14 22:15:35,985 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/wandb-summary.json
+2022-01-14 22:15:37,284 DEBUG   HandlerThread:7834 [meta.py:__init__():40] meta init
+2022-01-14 22:15:37,284 DEBUG   HandlerThread:7834 [meta.py:__init__():54] meta init done
+2022-01-14 22:15:37,284 DEBUG   HandlerThread:7834 [meta.py:probe():214] probe
+2022-01-14 22:15:37,286 DEBUG   HandlerThread:7834 [meta.py:_setup_git():204] setup git
+2022-01-14 22:15:37,315 DEBUG   HandlerThread:7834 [meta.py:_setup_git():211] setup git done
+2022-01-14 22:15:37,315 DEBUG   HandlerThread:7834 [meta.py:_save_code():92] save code
+2022-01-14 22:15:37,326 DEBUG   HandlerThread:7834 [meta.py:_save_code():113] save code done
+2022-01-14 22:15:37,326 DEBUG   HandlerThread:7834 [meta.py:_save_patches():130] save patches
+2022-01-14 22:15:37,985 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/code/run_mlm_flax.py
+2022-01-14 22:15:37,986 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/code
+2022-01-14 22:15:39,986 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:15:40,986 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/diff.patch
+2022-01-14 22:15:42,987 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:15:45,607 ERROR   HandlerThread:7834 [meta.py:_save_patches():171] Error generating diff: Command '['git', 'diff', '--submodule=diff', 'HEAD']' timed out after 5 seconds
+2022-01-14 22:15:45,607 DEBUG   HandlerThread:7834 [meta.py:_save_patches():172] save patches done
+2022-01-14 22:15:45,607 DEBUG   HandlerThread:7834 [meta.py:_save_pip():58] save pip
+2022-01-14 22:15:45,607 DEBUG   HandlerThread:7834 [meta.py:_save_pip():72] save pip done
+2022-01-14 22:15:45,608 DEBUG   HandlerThread:7834 [meta.py:probe():252] probe done
+2022-01-14 22:15:45,643 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:15:45,643 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:15:45,786 DEBUG   SenderThread:7834 [sender.py:send():234] send: config
+2022-01-14 22:15:45,787 DEBUG   SenderThread:7834 [sender.py:send():234] send: config
+2022-01-14 22:15:45,787 DEBUG   SenderThread:7834 [sender.py:send():234] send: config
+2022-01-14 22:15:45,787 DEBUG   SenderThread:7834 [sender.py:send():234] send: files
+2022-01-14 22:15:45,787 INFO    SenderThread:7834 [sender.py:_save_file():939] saving file wandb-metadata.json with policy now
+2022-01-14 22:15:45,789 INFO    SenderThread:7834 [sender.py:_save_file():939] saving file code/run_mlm_flax.py with policy now
+2022-01-14 22:15:45,988 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/wandb-metadata.json
+2022-01-14 22:15:45,989 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_created():217] file/dir created: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/requirements.txt
+2022-01-14 22:15:46,312 INFO    Thread-12 :7834 [upload_job.py:push():137] Uploaded file /tmp/tmpxqv1l1fswandb/2juok80v-code/run_mlm_flax.py
+2022-01-14 22:15:46,330 INFO    Thread-11 :7834 [upload_job.py:push():137] Uploaded file /tmp/tmpxqv1l1fswandb/xnc44171-wandb-metadata.json
+2022-01-14 22:15:50,990 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:15:59,991 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:15:59,991 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:16:05,368 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:16:05,997 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/config.yaml
+2022-01-14 22:16:15,132 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:16:15,132 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:16:30,272 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:16:30,272 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:16:35,439 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:16:39,009 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:16:45,408 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:16:45,408 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:17:00,601 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:17:00,601 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:17:05,512 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:17:15,756 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:17:15,756 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:17:30,970 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:17:30,971 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:17:35,586 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:17:46,135 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:17:46,136 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:18:01,309 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:18:01,309 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:18:05,663 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:18:16,458 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:18:16,458 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:18:31,596 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:18:31,597 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:18:35,742 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:18:46,731 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: stop_status
+2022-01-14 22:18:46,732 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: stop_status
+2022-01-14 22:19:03,067 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:19:03,953 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:03,953 DEBUG   SenderThread:7834 [sender.py:send():234] send: telemetry
+2022-01-14 22:19:03,953 DEBUG   SenderThread:7834 [sender.py:send():234] send: exit
+2022-01-14 22:19:03,953 INFO    SenderThread:7834 [sender.py:send_exit():366] handling exit code: 1
+2022-01-14 22:19:03,954 INFO    SenderThread:7834 [sender.py:send_exit():368] handling runtime: 208
+2022-01-14 22:19:03,954 INFO    SenderThread:7834 [sender.py:_save_file():939] saving file wandb-summary.json with policy end
+2022-01-14 22:19:03,954 INFO    SenderThread:7834 [sender.py:send_exit():374] send defer
+2022-01-14 22:19:03,954 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:03,955 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:03,955 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 0
+2022-01-14 22:19:03,955 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:03,955 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 0
+2022-01-14 22:19:03,955 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 1
+2022-01-14 22:19:03,955 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:03,955 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 1
+2022-01-14 22:19:04,011 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:04,011 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 1
+2022-01-14 22:19:04,011 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 2
+2022-01-14 22:19:04,011 DEBUG   SenderThread:7834 [sender.py:send():234] send: stats
+2022-01-14 22:19:04,012 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:04,012 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 2
+2022-01-14 22:19:04,012 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:04,012 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 2
+2022-01-14 22:19:04,012 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 3
+2022-01-14 22:19:04,012 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:04,012 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 3
+2022-01-14 22:19:04,012 DEBUG   SenderThread:7834 [sender.py:send():234] send: summary
+2022-01-14 22:19:04,013 INFO    SenderThread:7834 [sender.py:_save_file():939] saving file wandb-summary.json with policy end
+2022-01-14 22:19:04,013 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:04,013 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 3
+2022-01-14 22:19:04,013 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 4
+2022-01-14 22:19:04,013 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:04,013 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 4
+2022-01-14 22:19:04,013 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:04,013 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 4
+2022-01-14 22:19:04,057 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:04,068 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:19:04,068 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/wandb-summary.json
+2022-01-14 22:19:04,198 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 5
+2022-01-14 22:19:04,198 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:04,198 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:04,198 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 5
+2022-01-14 22:19:04,199 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:04,199 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 5
+2022-01-14 22:19:04,199 INFO    SenderThread:7834 [dir_watcher.py:finish():283] shutting down directory watcher
+2022-01-14 22:19:04,300 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,068 INFO    Thread-8  :7834 [dir_watcher.py:_on_file_modified():230] file/dir modified: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/config.yaml
+2022-01-14 22:19:05,069 INFO    SenderThread:7834 [dir_watcher.py:finish():313] scan: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files
+2022-01-14 22:19:05,069 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/config.yaml config.yaml
+2022-01-14 22:19:05,069 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/diff.patch diff.patch
+2022-01-14 22:19:05,069 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/requirements.txt requirements.txt
+2022-01-14 22:19:05,069 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log output.log
+2022-01-14 22:19:05,070 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/wandb-summary.json wandb-summary.json
+2022-01-14 22:19:05,070 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/wandb-metadata.json wandb-metadata.json
+2022-01-14 22:19:05,072 INFO    SenderThread:7834 [dir_watcher.py:finish():327] scan save: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/code/run_mlm_flax.py code/run_mlm_flax.py
+2022-01-14 22:19:05,073 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 6
+2022-01-14 22:19:05,073 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,081 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:05,081 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 6
+2022-01-14 22:19:05,081 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:05,081 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 6
+2022-01-14 22:19:05,081 INFO    SenderThread:7834 [file_pusher.py:finish():177] shutting down file pusher
+2022-01-14 22:19:05,183 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,183 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,285 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,285 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,387 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,387 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,488 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,489 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,539 INFO    Thread-13 :7834 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/config.yaml
+2022-01-14 22:19:05,556 INFO    Thread-15 :7834 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/output.log
+2022-01-14 22:19:05,561 INFO    Thread-14 :7834 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/requirements.txt
+2022-01-14 22:19:05,590 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,590 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,599 INFO    Thread-16 :7834 [upload_job.py:push():137] Uploaded file /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/files/wandb-summary.json
+2022-01-14 22:19:05,692 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,692 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,794 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:05,794 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:05,799 INFO    Thread-7  :7834 [sender.py:transition_state():387] send defer: 7
+2022-01-14 22:19:05,800 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:05,800 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 7
+2022-01-14 22:19:05,800 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:05,800 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 7
+2022-01-14 22:19:05,896 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:06,218 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 8
+2022-01-14 22:19:06,218 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:06,218 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:06,218 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 8
+2022-01-14 22:19:06,219 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:06,219 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 8
+2022-01-14 22:19:06,219 INFO    SenderThread:7834 [sender.py:transition_state():387] send defer: 9
+2022-01-14 22:19:06,219 DEBUG   SenderThread:7834 [sender.py:send():234] send: final
+2022-01-14 22:19:06,219 DEBUG   SenderThread:7834 [sender.py:send():234] send: footer
+2022-01-14 22:19:06,219 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: defer
+2022-01-14 22:19:06,220 INFO    HandlerThread:7834 [handler.py:handle_request_defer():147] handle defer: 9
+2022-01-14 22:19:06,220 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: defer
+2022-01-14 22:19:06,220 INFO    SenderThread:7834 [sender.py:send_request_defer():383] handle sender defer: 9
+2022-01-14 22:19:06,320 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: poll_exit
+2022-01-14 22:19:06,320 DEBUG   SenderThread:7834 [sender.py:send_request():248] send_request: poll_exit
+2022-01-14 22:19:06,320 INFO    SenderThread:7834 [file_pusher.py:join():182] waiting for file pusher
+2022-01-14 22:19:06,598 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: get_summary
+2022-01-14 22:19:06,618 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: sampled_history
+2022-01-14 22:19:06,619 DEBUG   HandlerThread:7834 [handler.py:handle_request():130] handle_request: shutdown
+2022-01-14 22:19:06,619 INFO    HandlerThread:7834 [handler.py:finish():731] shutting down handler
+2022-01-14 22:19:07,220 INFO    WriterThread:7834 [datastore.py:close():281] close: /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/run-24dma583.wandb
+2022-01-14 22:19:07,575 INFO    SenderThread:7834 [sender.py:finish():1070] shutting down sender
+2022-01-14 22:19:07,575 INFO    SenderThread:7834 [file_pusher.py:finish():177] shutting down file pusher
+2022-01-14 22:19:07,576 INFO    SenderThread:7834 [file_pusher.py:join():182] waiting for file pusher
+2022-01-14 22:19:07,578 INFO    MainThread:7834 [internal.py:handle_exit():77] Internal process exited

wandb/run-20220114_221533-24dma583/logs/debug.log ADDED Viewed

	@@ -0,0 +1,141 @@

+2022-01-14 22:15:33,825 INFO    MainThread:4503 [wandb_setup.py:_flush():71] setting env: {}
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [wandb_setup.py:_flush():71] setting login settings: {}
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [wandb_init.py:_log_setup():371] Logging user logs to /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/logs/debug.log
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [wandb_init.py:_log_setup():372] Logging internal logs to /data/roberta-base-ncc/wandb/run-20220114_221533-24dma583/logs/debug-internal.log
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [wandb_init.py:init():404] calling init triggers
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [wandb_init.py:init():409] wandb.init called with sweep_config: {}
+config: {}
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [wandb_init.py:init():460] starting backend
+2022-01-14 22:15:33,826 INFO    MainThread:4503 [backend.py:_multiprocessing_setup():99] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2022-01-14 22:15:33,871 INFO    MainThread:4503 [backend.py:ensure_launched():216] starting backend process...
+2022-01-14 22:15:33,898 INFO    MainThread:4503 [backend.py:ensure_launched():221] started backend process with pid: 7834
+2022-01-14 22:15:33,900 INFO    MainThread:4503 [wandb_init.py:init():469] backend started and connected
+2022-01-14 22:15:33,911 INFO    MainThread:4503 [wandb_init.py:init():533] updated telemetry
+2022-01-14 22:15:33,976 INFO    MainThread:4503 [wandb_init.py:init():563] communicating current version
+2022-01-14 22:15:34,784 INFO    MainThread:4503 [wandb_init.py:init():568] got version response
+2022-01-14 22:15:34,784 INFO    MainThread:4503 [wandb_init.py:init():578] communicating run to backend with 30 second timeout
+2022-01-14 22:15:34,980 INFO    MainThread:4503 [wandb_init.py:init():606] starting run threads in backend
+2022-01-14 22:15:39,985 INFO    MainThread:4503 [wandb_run.py:_console_start():1810] atexit reg
+2022-01-14 22:15:39,985 INFO    MainThread:4503 [wandb_run.py:_redirect():1684] redirect: SettingsConsole.REDIRECT
+2022-01-14 22:15:39,986 INFO    MainThread:4503 [wandb_run.py:_redirect():1689] Redirecting console.
+2022-01-14 22:15:39,988 INFO    MainThread:4503 [wandb_run.py:_redirect():1745] Redirects installed.
+2022-01-14 22:15:39,988 INFO    MainThread:4503 [wandb_init.py:init():633] run started, returning control to user process
+2022-01-14 22:15:39,989 INFO    MainThread:4503 [wandb_run.py:_config_callback():956] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'per_device_train_batch_size': 250, 'per_device_eval_batch_size': 250, 'learning_rate': 0.0006, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-06, 'adafactor': False, 'num_train_epochs': 3.0, 'warmup_steps': 10000, 'logging_steps': 1000, 'save_steps': 1000, 'eval_steps': 1000, 'seed': 42, 'push_to_hub': True, 'hub_model_id': None, 'hub_token': None}
+2022-01-14 22:15:39,989 INFO    MainThread:4503 [wandb_run.py:_config_callback():956] config_cb None None {'model_name_or_path': None, 'model_type': 'roberta', 'config_name': 'roberta-base', 'tokenizer_name': 'NbAiLab/nb-roberta-base', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
+2022-01-14 22:15:39,990 INFO    MainThread:4503 [wandb_run.py:_config_callback():956] config_cb None None {'dataset_name': 'NbAiLab/NCC', 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 128, 'preprocessing_num_workers': None, 'mlm_probability': 0.15, 'pad_to_max_length': True, 'line_by_line': False}
+2022-01-14 22:19:01,641 INFO    MainThread:4503 [wandb_run.py:_atexit_cleanup():1780] got exitcode: 1
+2022-01-14 22:19:01,645 INFO    MainThread:4503 [wandb_run.py:_restore():1752] restore
+2022-01-14 22:19:03,955 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 1
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 37446
+}
+2022-01-14 22:19:04,199 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 1
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 37446
+}
+2022-01-14 22:19:05,082 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,184 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,286 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,387 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,489 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,591 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,693 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:05,795 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:06,219 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+2022-01-14 22:19:06,576 INFO    MainThread:4503 [wandb_run.py:_wait_for_finish():1912] got exit ret: done: true
+exit_result {
+}
+file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 45535
+  total_bytes: 45535
+}
+local_info {
+}
+2022-01-14 22:19:09,886 INFO    MainThread:4503 [wandb_run.py:_append_files():2180] logging synced files

wandb/run-20220114_221533-24dma583/run-24dma583.wandb ADDED Viewed

Binary file (7.18 kB). View file

wandb/run-20220114_234119-1zya86oe/files/code/run_mlm_flax.py ADDED Viewed

	@@ -0,0 +1,815 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
+text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=fill-mask
+"""
+import json
+import logging
+import math
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from itertools import chain
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    FlaxAutoModelForMaskedLM,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    TensorType,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.file_utils import get_full_repo_name
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+@dataclass
+class TrainingArguments:
+    """
+    Options controlling the training run: output location, batch sizes, optimizer hyper-parameters,
+    logging/saving/evaluation cadence, seeding and Hub upload settings.
+    The ``help`` entry in each field's metadata describes the individual option.
+    """
+    # `output_dir` is the only field without a default, so it must always be supplied.
+    output_dir: str = field(
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Overwrite the content of the output directory. "
+                "Use this to continue training if output_dir points to a checkpoint directory."
+            )
+        },
+    )
+    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
+    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+    per_device_train_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
+    )
+    per_device_eval_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
+    )
+    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+    adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
+    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
+    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+    # NOTE(review): annotated as `int` but defaults to None, so consumers must handle None.
+    eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+    seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
+    push_to_hub: bool = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    # NOTE(review): the two Hub fields below are annotated as `str` but default to None.
+    hub_model_id: str = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
+    def __post_init__(self):
+        # Expand a leading "~" so `output_dir` can be given relative to the user's home directory.
+        if self.output_dir is not None:
+            self.output_dir = os.path.expanduser(self.output_dir)
+    def to_dict(self):
+        """
+        Serialize this instance to a plain dict, replacing `Enum` members (and lists of them) by their values
+        for JSON serialization support. Every key ending in ``_token`` has its value replaced by a
+        ``<KEY_NAME>`` placeholder so that secrets are not written out.
+        """
+        d = asdict(self)
+        # Only values of existing keys are reassigned, so updating `d` while iterating over it is safe.
+        for k, v in d.items():
+            if isinstance(v, Enum):
+                d[k] = v.value
+            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
+                d[k] = [x.value for x in v]
+            # The placeholder is written even when the token is None.
+            if k.endswith("_token"):
+                d[k] = f"<{k.upper()}>"
+        return d
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    Every field is optional; see the ``help`` metadata of each field for its meaning.
+    """
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    # MODEL_TYPES is the module-level tuple built from FLAX_MODEL_FOR_MASKED_LM_MAPPING; it is only used
+    # here to list the accepted values in the help text.
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    # main() resolves this string with `getattr(jnp, model_args.dtype)`, so it must name a jax.numpy dtype.
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    At least one of `dataset_name`, `train_file` or `validation_file` must be provided; this is
+    enforced in `__post_init__`.
+    """
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    # NOTE(review): main() computes `min(data_args.max_seq_length, tokenizer.model_max_length)`, which
+    # raises TypeError while this is left at its None default, despite what the help text says.
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    def __post_init__(self):
+        # Require a data source, then check the extension of any local file that was given.
+        # Raises ValueError when no source is set and AssertionError on an unsupported extension.
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            # The extension is the text after the last "." and is compared case-sensitively.
+            # The checks use `assert`, so they are skipped when Python runs with -O.
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+@flax.struct.dataclass
+class FlaxDataCollatorForLanguageModeling:
+    """
+    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
+    are not all of the same length.
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input.
+    .. note::
+        For best performance, this data collator should be used with a dataset having items that are dictionaries or
+        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
+        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
+        argument :obj:`return_special_tokens_mask=True`.
+    """
+    tokenizer: PreTrainedTokenizerBase
+    mlm_probability: float = 0.15
+    def __post_init__(self):
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                "You should pass `mlm=False` to train on causal language modeling instead."
+            )
+    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
+        # Handle dict or lists with proper padding and conversion to tensor.
+        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        batch["input_ids"], batch["labels"] = self.mask_tokens(
+            batch["input_ids"], special_tokens_mask=special_tokens_mask
+        )
+        return batch
+    def mask_tokens(
+        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        """
+        labels = inputs.copy()
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = np.full(labels.shape, self.mlm_probability)
+        special_tokens_mask = special_tokens_mask.astype("bool")
+        probability_matrix[special_tokens_mask] = 0.0
+        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
+        indices_random &= masked_indices & ~indices_replaced
+        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
+        inputs[indices_random] = random_words[indices_random]
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    num_samples = len(samples_idx)
+    samples_to_remove = num_samples % batch_size
+    if samples_to_remove != 0:
+        samples_idx = samples_idx[:-samples_to_remove]
+    sections_split = num_samples // batch_size
+    batch_idx = np.split(samples_idx, sections_split)
+    return batch_idx
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    summary_writer.scalar("train_time", train_time, step)
+    train_metrics = get_metrics(train_metrics)
+    for key, vals in train_metrics.items():
+        tag = f"train_{key}"
+        for i, val in enumerate(vals):
+            summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+def write_eval_metric(summary_writer, eval_metrics, step):
+    for metric_name, value in eval_metrics.items():
+        summary_writer.scalar(f"eval_{metric_name}", value, step)
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        level=logging.INFO,
+        datefmt="[%X]",
+    )
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model and tokenizer
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+            return tokenizer(
+                examples,
+                return_special_tokens_mask=True,
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+            )
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            input_columns=[text_column_name],
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            # Enable Weight&Biases
+            import wandb
+            wandb.init(
+                entity='versae',
+                project='roberta-base-ncc',
+                sync_tensorboard=False,
+            )
+            wandb.config.update(training_args)
+            wandb.config.update(model_args)
+            wandb.config.update(data_args)
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+    # create adam optimizer
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        def loss_fn(params):
+            labels = batch.pop("labels")
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            # compute loss, ignore padded input tokens
+            label_mask = jnp.where(labels > 0, 1.0, 0.0)
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+            # take average
+            loss = loss.sum() / label_mask.sum()
+            return loss
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+        return new_state, metrics, new_dropout_rng
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        # compute loss, ignore padded input tokens
+        label_mask = jnp.where(labels > 0, 1.0, 0.0)
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
+        # summarize metrics
+        metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
+        metrics = jax.lax.psum(metrics, axis_name="batch")
+        return metrics
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+                train_metrics = []
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save a checkpoint every `save_steps` steps and push it to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+    # Eval after training
+    if training_args.do_eval:
+        num_eval_samples = len(tokenized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+        eval_metrics = []
+        for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+        # normalize eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
+        eval_normalizer = eval_metrics.pop("normalizer")
+        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+        try:
+            perplexity = math.exp(eval_metrics["loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        eval_metrics["perplexity"] = perplexity
+        if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+# Script entry point: run main() only when this file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()

wandb/run-20220114_234119-1zya86oe/files/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.12.9
+    code_path: code/run_mlm_flax.py
+    framework: huggingface
+    huggingface_version: 4.16.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.10
+    start_time: 1642203679
+    t:
+      1:
+      - 2
+      - 3
+      - 11
+      - 12
+      2:
+      - 2
+      - 3
+      - 11
+      - 12
+      4: 3.8.10
+      5: 0.12.9
+      6: 4.16.0.dev0
+      8:
+      - 5
+adafactor:
+  desc: null
+  value: false
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.98
+adam_epsilon:
+  desc: null
+  value: 1.0e-06
+cache_dir:
+  desc: null
+  value: null
+config_name:
+  desc: null
+  value: roberta-base
+dataset_config_name:
+  desc: null
+  value: null
+dataset_name:
+  desc: null
+  value: NbAiLab/NCC
+do_eval:
+  desc: null
+  value: true
+do_train:
+  desc: null
+  value: true
+dtype:
+  desc: null
+  value: bfloat16
+eval_steps:
+  desc: null
+  value: 1000
+hub_model_id:
+  desc: null
+  value: null
+hub_token:
+  desc: null
+  value: null
+learning_rate:
+  desc: null
+  value: 0.0006
+line_by_line:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 1000
+max_seq_length:
+  desc: null
+  value: 128
+mlm_probability:
+  desc: null
+  value: 0.15
+model_name_or_path:
+  desc: null
+  value: null
+model_type:
+  desc: null
+  value: roberta
+num_train_epochs:
+  desc: null
+  value: 3.0
+output_dir:
+  desc: null
+  value: ./
+overwrite_cache:
+  desc: null
+  value: false
+overwrite_output_dir:
+  desc: null
+  value: true
+pad_to_max_length:
+  desc: null
+  value: true
+per_device_eval_batch_size:
+  desc: null
+  value: 232
+per_device_train_batch_size:
+  desc: null
+  value: 232
+preprocessing_num_workers:
+  desc: null
+  value: null
+push_to_hub:
+  desc: null
+  value: true
+save_steps:
+  desc: null
+  value: 1000
+seed:
+  desc: null
+  value: 42
+tokenizer_name:
+  desc: null
+  value: NbAiLab/nb-roberta-base
+train_file:
+  desc: null
+  value: null
+train_ref_file:
+  desc: null
+  value: null
+use_fast_tokenizer:
+  desc: null
+  value: true
+validation_file:
+  desc: null
+  value: null
+validation_ref_file:
+  desc: null
+  value: null
+validation_split_percentage:
+  desc: null
+  value: 5
+warmup_steps:
+  desc: null
+  value: 10000
+weight_decay:
+  desc: null
+  value: 0.01

wandb/run-20220114_234119-1zya86oe/files/diff.patch ADDED Viewed

File without changes

wandb/run-20220114_234119-1zya86oe/files/output.log ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85f905cb3060152eb20f5227d157a7605e33c762b80a6b8f4792e5791a1acd2b
+size 26403055

wandb/run-20220114_234119-1zya86oe/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,122 @@

+absl-py==1.0.0
+aiohttp==3.8.1
+aiosignal==1.2.0
+astunparse==1.6.3
+async-timeout==4.0.2
+attrs==21.4.0
+backcall==0.2.0
+cachetools==4.2.4
+certifi==2021.10.8
+charset-normalizer==2.0.10
+chex==0.1.0
+click==8.0.3
+clu==0.0.6
+configparser==5.2.0
+contextlib2==21.6.0
+cycler==0.11.0
+datasets==1.17.1.dev0
+decorator==5.1.0
+dill==0.3.4
+dm-tree==0.1.6
+docker-pycreds==0.4.0
+filelock==3.4.2
+flatbuffers==2.0
+flax==0.3.6
+fonttools==4.28.5
+frozenlist==1.2.0
+fsspec==2021.11.1
+future==0.18.2
+gast==0.4.0
+gitdb==4.0.9
+gitpython==3.1.26
+google-auth-oauthlib==0.4.6
+google-auth==2.3.3
+google-pasta==0.2.0
+googleapis-common-protos==1.54.0
+grpcio==1.43.0
+h5py==3.6.0
+huggingface-hub==0.2.1
+idna==3.3
+importlib-metadata==4.10.0
+importlib-resources==5.4.0
+ipython==7.31.0
+jax==0.2.26
+jaxlib==0.1.75
+jedi==0.18.1
+joblib==1.1.0
+keras-preprocessing==1.1.2
+keras==2.7.0
+kiwisolver==1.3.2
+libclang==12.0.0
+libtpu-nightly==0.1.dev20211208
+markdown==3.3.6
+matplotlib-inline==0.1.3
+matplotlib==3.5.1
+ml-collections==0.1.0
+msgpack==1.0.3
+multidict==5.2.0
+multiprocess==0.70.12.2
+numpy==1.22.0
+oauthlib==3.1.1
+opt-einsum==3.3.0
+optax==0.1.0
+packaging==21.3
+pandas==1.3.5
+parso==0.8.3
+pathtools==0.1.2
+pexpect==4.8.0
+pickleshare==0.7.5
+pillow==9.0.0
+pip==20.0.2
+pkg-resources==0.0.0
+promise==2.3
+prompt-toolkit==3.0.24
+protobuf==3.19.1
+psutil==5.9.0
+ptyprocess==0.7.0
+pyarrow==6.0.1
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pygments==2.11.1
+pyparsing==3.0.6
+python-dateutil==2.8.2
+pytz==2021.3
+pyyaml==6.0
+regex==2021.11.10
+requests-oauthlib==1.3.0
+requests==2.27.0
+rsa==4.8
+sacremoses==0.0.46
+scipy==1.7.3
+sentry-sdk==1.5.2
+setuptools==44.0.0
+shortuuid==1.0.8
+six==1.16.0
+smmap==5.0.0
+subprocess32==3.5.4
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.0
+tensorboard==2.7.0
+tensorflow-cpu==2.7.0
+tensorflow-datasets==4.4.0
+tensorflow-estimator==2.7.0
+tensorflow-io-gcs-filesystem==0.23.1
+tensorflow-metadata==1.5.0
+tensorflow==2.7.0
+termcolor==1.1.0
+tokenizers==0.11.2
+toolz==0.11.2
+tqdm==4.62.3
+traitlets==5.1.1
+transformers==4.16.0.dev0
+typing-extensions==3.10.0.2
+urllib3==1.26.7
+wandb==0.12.9
+wcwidth==0.2.5
+werkzeug==2.0.2
+wheel==0.37.1
+wrapt==1.13.3
+xxhash==2.0.2
+yarl==1.7.2
+yaspin==2.1.0
+zipp==3.7.0

wandb/run-20220114_234119-1zya86oe/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+    "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
+    "python": "3.8.10",
+    "heartbeatAt": "2022-01-14T23:41:22.464389",
+    "startedAt": "2022-01-14T23:41:19.296699",
+    "docker": null,
+    "cpu_count": 96,
+    "cuda": null,
+    "args": [
+        "--output_dir=./",
+        "--model_type=roberta",
+        "--config_name=roberta-base",
+        "--tokenizer_name=NbAiLab/nb-roberta-base",
+        "--dataset_name=NbAiLab/NCC",
+        "--max_seq_length=128",
+        "--weight_decay=0.01",
+        "--per_device_train_batch_size=232",
+        "--per_device_eval_batch_size=232",
+        "--pad_to_max_length",
+        "--learning_rate=6e-4",
+        "--warmup_steps=10000",
+        "--overwrite_output_dir",
+        "--num_train_epochs=3",
+        "--adam_beta1=0.9",
+        "--adam_beta2=0.98",
+        "--adam_epsilon=1e-6",
+        "--logging_steps=1000",
+        "--save_steps=1000",
+        "--eval_steps=1000",
+        "--do_train",
+        "--do_eval",
+        "--dtype=bfloat16",
+        "--push_to_hub"
+    ],
+    "state": "running",
+    "program": "run_mlm_flax.py",
+    "codePath": "run_mlm_flax.py",
+    "git": {
+        "remote": "https://huggingface.co/versae/roberta-base-ncc",
+        "commit": "502df078f73cf93ca9380fcac1c9b9c7598a445f"
+    },
+    "email": "versae@gmail.com",
+    "root": "/data/roberta-base-ncc",
+    "host": "t1v-n-eedfb410-w-0",
+    "username": "javierr",
+    "executable": "/data/flax/bin/python"
+}

wandb/run-20220114_234119-1zya86oe/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb": {"runtime": 356856}}

wandb/run-20220114_234119-1zya86oe/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a61912984f47a6ce815f00cc68ecb84206e92936ff60cd6573447587dd00aa
+size 38335870

wandb/run-20220114_234119-1zya86oe/logs/debug.log ADDED Viewed

	@@ -0,0 +1,168 @@

+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_setup.py:_flush():71] setting env: {}
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_setup.py:_flush():71] setting login settings: {}
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_init.py:_log_setup():371] Logging user logs to /data/roberta-base-ncc/wandb/run-20220114_234119-1zya86oe/logs/debug.log
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_init.py:_log_setup():372] Logging internal logs to /data/roberta-base-ncc/wandb/run-20220114_234119-1zya86oe/logs/debug-internal.log
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_init.py:init():404] calling init triggers
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_init.py:init():409] wandb.init called with sweep_config: {}
+config: {}
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [wandb_init.py:init():460] starting backend
+2022-01-14 23:41:19,298 INFO    MainThread:10537 [backend.py:_multiprocessing_setup():99] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2022-01-14 23:41:19,328 INFO    MainThread:10537 [backend.py:ensure_launched():216] starting backend process...
+2022-01-14 23:41:19,355 INFO    MainThread:10537 [backend.py:ensure_launched():221] started backend process with pid: 11989
+2022-01-14 23:41:19,357 INFO    MainThread:10537 [wandb_init.py:init():469] backend started and connected
+2022-01-14 23:41:19,369 INFO    MainThread:10537 [wandb_init.py:init():533] updated telemetry
+2022-01-14 23:41:19,437 INFO    MainThread:10537 [wandb_init.py:init():563] communicating current version
+2022-01-14 23:41:20,144 INFO    MainThread:10537 [wandb_init.py:init():568] got version response
+2022-01-14 23:41:20,145 INFO    MainThread:10537 [wandb_init.py:init():578] communicating run to backend with 30 second timeout
+2022-01-14 23:41:20,323 INFO    MainThread:10537 [wandb_init.py:init():606] starting run threads in backend
+2022-01-14 23:41:25,327 INFO    MainThread:10537 [wandb_run.py:_console_start():1810] atexit reg
+2022-01-14 23:41:25,328 INFO    MainThread:10537 [wandb_run.py:_redirect():1684] redirect: SettingsConsole.REDIRECT
+2022-01-14 23:41:25,328 INFO    MainThread:10537 [wandb_run.py:_redirect():1689] Redirecting console.
+2022-01-14 23:41:25,330 INFO    MainThread:10537 [wandb_run.py:_redirect():1745] Redirects installed.
+2022-01-14 23:41:25,331 INFO    MainThread:10537 [wandb_init.py:init():633] run started, returning control to user process
+2022-01-14 23:41:25,331 INFO    MainThread:10537 [wandb_run.py:_config_callback():956] config_cb None None {'output_dir': './', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'per_device_train_batch_size': 232, 'per_device_eval_batch_size': 232, 'learning_rate': 0.0006, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.98, 'adam_epsilon': 1e-06, 'adafactor': False, 'num_train_epochs': 3.0, 'warmup_steps': 10000, 'logging_steps': 1000, 'save_steps': 1000, 'eval_steps': 1000, 'seed': 42, 'push_to_hub': True, 'hub_model_id': None, 'hub_token': None}
+2022-01-14 23:41:25,332 INFO    MainThread:10537 [wandb_run.py:_config_callback():956] config_cb None None {'model_name_or_path': None, 'model_type': 'roberta', 'config_name': 'roberta-base', 'tokenizer_name': 'NbAiLab/nb-roberta-base', 'cache_dir': None, 'use_fast_tokenizer': True, 'dtype': 'bfloat16'}
+2022-01-14 23:41:25,332 INFO    MainThread:10537 [wandb_run.py:_config_callback():956] config_cb None None {'dataset_name': 'NbAiLab/NCC', 'dataset_config_name': None, 'train_file': None, 'validation_file': None, 'train_ref_file': None, 'validation_ref_file': None, 'overwrite_cache': False, 'validation_split_percentage': 5, 'max_seq_length': 128, 'preprocessing_num_workers': None, 'mlm_probability': 0.15, 'pad_to_max_length': True, 'line_by_line': False}
+2022-01-19 02:48:53,379 INFO    MainThread:10537 [wandb_run.py:_atexit_cleanup():1780] got exitcode: 0
+2022-01-19 02:48:53,381 INFO    MainThread:10537 [wandb_run.py:_restore():1752] restore
+2022-01-19 02:48:56,346 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 1
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 37446
+}
+2022-01-19 02:48:56,559 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 1
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 37446
+}
+2022-01-19 02:48:56,919 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,021 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 37446
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,123 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 10068910
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,225 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 13042606
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,327 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 21463982
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,429 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,531 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,633 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,735 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,837 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:57,939 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:58,457 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+2022-01-19 02:48:58,818 INFO    MainThread:10537 [wandb_run.py:_wait_for_finish():1912] got exit ret: done: true
+exit_result {
+}
+file_counts {
+  wandb_count: 5
+  other_count: 1
+}
+pusher_stats {
+  uploaded_bytes: 26444957
+  total_bytes: 26444957
+}
+local_info {
+}
+2022-01-19 02:49:00,429 INFO    MainThread:10537 [wandb_run.py:_append_files():2180] logging synced files

wandb/run-20220114_234119-1zya86oe/run-1zya86oe.wandb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f93d599b590506c08952e582e6ab64f317f0ac3488bf1937504d605ab8ecf5b
+size 118429569

wandb/run-20220119_161158-274aad95/files/code/run_mlm_flax.py ADDED Viewed

	@@ -0,0 +1,815 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
+text file or a dataset.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=fill-mask
+"""
+import json
+import logging
+import math
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from itertools import chain
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    FlaxAutoModelForMaskedLM,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    TensorType,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.file_utils import get_full_repo_name
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+@dataclass
+class TrainingArguments:
+    output_dir: str = field(
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Overwrite the content of the output directory. "
+                "Use this to continue training if output_dir points to a checkpoint directory."
+            )
+        },
+    )
+    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
+    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+    per_device_train_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
+    )
+    per_device_eval_batch_size: int = field(
+        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
+    )
+    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+    adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
+    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
+    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+    eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+    seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
+    push_to_hub: bool = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    hub_model_id: str = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
+    def __post_init__(self):
+        if self.output_dir is not None:
+            self.output_dir = os.path.expanduser(self.output_dir)
+    def to_dict(self):
+        """
+        Serializes this instance while replace `Enum` by their values (for JSON serialization support). It obfuscates
+        the token values by removing their value.
+        """
+        d = asdict(self)
+        for k, v in d.items():
+            if isinstance(v, Enum):
+                d[k] = v.value
+            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
+                d[k] = [x.value for x in v]
+            if k.endswith("_token"):
+                d[k] = f"<{k.upper()}>"
+        return d
+@dataclass
+class ModelArguments:
+    """
+    Command-line arguments selecting the model, config and tokenizer to fine-tune or to train from scratch.
+    Each field's ``help`` metadata is the text shown by the argument parser.
+    """
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            # The trailing space keeps the two concatenated literals from running together in `--help`.
+            "help": "The model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    # Name of a `jax.numpy` dtype; `main` resolves it with `getattr(jnp, model_args.dtype)`.
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Command-line arguments describing the data fed to the model for training and evaluation.
+    At least one of `dataset_name`, `train_file` or `validation_file` must be given; local files must be
+    csv, json or txt.
+    """
+    dataset_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."},
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
+    )
+    train_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "The input training data file (a text file)."},
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False,
+        metadata={"help": "Overwrite the cached training and evaluation sets"},
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15,
+        metadata={"help": "Ratio of tokens to mask for masked language modeling loss"},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    def __post_init__(self):
+        """Check that a data source was given and that local files have a supported extension."""
+        no_source = self.dataset_name is None and self.train_file is None and self.validation_file is None
+        if no_source:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        allowed_extensions = ["csv", "json", "txt"]
+        # The training file is checked first, then the validation file.
+        for arg_name, file_path in (("train_file", self.train_file), ("validation_file", self.validation_file)):
+            if file_path is None:
+                continue
+            suffix = file_path.rpartition(".")[2]
+            assert suffix in allowed_extensions, f"`{arg_name}` should be a csv, a json or a txt file."
+@flax.struct.dataclass
+class FlaxDataCollatorForLanguageModeling:
+    """
+    Data collator used for masked language modeling. Examples are padded with the tokenizer to a common length
+    (a multiple of `pad_to_multiple_of`) and a random subset of the non-special tokens is selected for prediction.
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data. It must define a mask token.
+        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input.
+    .. note::
+        For best performance, this data collator should be used with a dataset having items that are dictionaries or
+        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
+        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
+        argument :obj:`return_special_tokens_mask=True`. When the key is missing, the mask is recomputed from the
+        token ids for every batch.
+    """
+    tokenizer: PreTrainedTokenizerBase
+    mlm_probability: float = 0.15
+    def __post_init__(self):
+        """Reject tokenizers without a mask token, since masking would be impossible."""
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                "You should pass `mlm=False` to train on causal language modeling instead."
+            )
+    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
+        """
+        Pad `examples` into one NumPy batch and apply MLM masking.
+        Returns the padded batch with `input_ids` replaced by their corrupted version and a new `labels` entry;
+        the `special_tokens_mask` entry, if present, is consumed and removed.
+        """
+        # Handle dict or lists with proper padding and conversion to tensor.
+        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        batch["input_ids"], batch["labels"] = self.mask_tokens(
+            batch["input_ids"], special_tokens_mask=special_tokens_mask
+        )
+        return batch
+    def mask_tokens(
+        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        Args:
+            inputs: token ids of the padded batch; modified in place and also returned.
+            special_tokens_mask: array of the same shape as `inputs`, non-zero where a token must never be
+                selected. If `None`, it is derived from `inputs` with the tokenizer.
+        Returns:
+            `(inputs, labels)` where `labels` holds the original id at selected positions and -100 elsewhere.
+        """
+        labels = inputs.copy()
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = np.full(labels.shape, self.mlm_probability)
+        if special_tokens_mask is None:
+            # The examples carried no precomputed mask (`__call__` pops it with a `None` default), so build it
+            # from the ids instead of failing on `None.astype`.
+            special_tokens_mask = np.array(
+                [
+                    self.tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
+                    for ids in labels.tolist()
+                ]
+            )
+        special_tokens_mask = special_tokens_mask.astype("bool")
+        probability_matrix[special_tokens_mask] = 0.0
+        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+        # 10% of the time, we replace masked input tokens with random word
+        # (half of the 20% of selected positions that were not replaced by the mask token)
+        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
+        indices_random &= masked_indices & ~indices_replaced
+        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
+        inputs[indices_random] = random_words[indices_random]
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    """
+    Split `samples_idx` into consecutive batches of exactly `batch_size` indices.
+    Trailing indices that do not fill a complete batch are dropped. When there are fewer than `batch_size`
+    indices, an empty list is returned instead of letting `np.split` fail on zero sections.
+    Args:
+        samples_idx: one-dimensional array of sample indices.
+        batch_size: number of indices per batch.
+    Returns:
+        A list of arrays, one per full batch, in the order of `samples_idx`.
+    """
+    num_samples = len(samples_idx)
+    num_batches = num_samples // batch_size
+    if num_batches == 0:
+        # Not enough samples for a single full batch: nothing to iterate over.
+        return []
+    # Keep only the indices that fit in whole batches, then cut them into equal sections.
+    samples_idx = samples_idx[: num_batches * batch_size]
+    batch_idx = np.split(samples_idx, num_batches)
+    return batch_idx
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    """
+    Log the accumulated training time and the buffered per-step training metrics.
+    `train_metrics` is passed through `get_metrics`; each resulting series is written under ``train_<key>`` so
+    that its last value lands on `step` and earlier values on the steps just before it.
+    """
+    summary_writer.scalar("train_time", train_time, step)
+    collected = get_metrics(train_metrics)
+    for name, series in collected.items():
+        first_step = step - len(series) + 1
+        for offset, value in enumerate(series):
+            summary_writer.scalar(f"train_{name}", value, first_step + offset)
+def write_eval_metric(summary_writer, eval_metrics, step):
+    """Write every entry of `eval_metrics` as an ``eval_<name>`` scalar at `step`."""
+    for name in eval_metrics:
+        summary_writer.scalar(f"eval_{name}", eval_metrics[name], step)
+def main():
+    """
+    Train and/or evaluate a Flax masked language model.
+    Steps: parse the model/data/training arguments, load and tokenize the datasets, build the model, optimizer
+    and pmapped train/eval steps, then loop over epochs while logging every `logging_steps`, evaluating every
+    `eval_steps` and saving (optionally pushing to the Hub) every `save_steps`. When `do_eval` is set, a final
+    evaluation is written to ``eval_results.json`` in the output directory.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        level=logging.INFO,
+        datefmt="[%X]",
+    )
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        # `train_file` may be None when only a validation file is given; take the extension from whichever file
+        # was supplied instead of calling `.split` on None.
+        source_file = data_args.train_file if data_args.train_file is not None else data_args.validation_file
+        extension = source_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model and tokenizer
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+    # `max_seq_length` defaults to None, and `min(None, int)` raises TypeError; fall back to the tokenizer's
+    # maximum length as the argument's help text describes.
+    if data_args.max_seq_length is None:
+        max_seq_length = tokenizer.model_max_length
+    else:
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+            return tokenizer(
+                examples,
+                return_special_tokens_mask=True,
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+            )
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            input_columns=[text_column_name],
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            # Enable Weight&Biases
+            import wandb
+            wandb.init(
+                entity='versae',
+                project='roberta-base-ncc',
+                sync_tensorboard=False,
+            )
+            wandb.config.update(training_args)
+            wandb.config.update(model_args)
+            wandb.config.update(data_args)
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    elif not has_tensorboard:
+        # Only warn when the package is really missing; non-master processes with tensorboard installed
+        # simply do not create a writer.
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+    # create the optimizer (Adafactor or AdamW)
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        def loss_fn(params):
+            labels = batch.pop("labels")
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            # compute loss, ignore padded input tokens
+            label_mask = jnp.where(labels > 0, 1.0, 0.0)
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+            # take average
+            loss = loss.sum() / label_mask.sum()
+            return loss
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+        return new_state, metrics, new_dropout_rng
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        # compute loss, ignore padded input tokens
+        label_mask = jnp.where(labels > 0, 1.0, 0.0)
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
+        # summarize metrics
+        metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
+        metrics = jax.lax.psum(metrics, axis_name="batch")
+        return metrics
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                # Restart the interval so the next logging step does not add the already counted time again.
+                train_start = time.time()
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+                train_metrics = []
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save a checkpoint every `save_steps` steps and push it to the hub if requested
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+    # Eval after training
+    if training_args.do_eval:
+        num_eval_samples = len(tokenized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+        eval_metrics = []
+        for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+        # normalize eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
+        eval_normalizer = eval_metrics.pop("normalizer")
+        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+        try:
+            perplexity = math.exp(eval_metrics["loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        eval_metrics["perplexity"] = perplexity
+        if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+# Run the training script only when this file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()

wandb/run-20220119_161158-274aad95/files/config.yaml ADDED Viewed

	@@ -0,0 +1,147 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    cli_version: 0.12.9
+    code_path: code/run_mlm_flax.py
+    framework: huggingface
+    huggingface_version: 4.16.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    python_version: 3.8.10
+    start_time: 1642608719
+    t:
+      1:
+      - 2
+      - 3
+      - 11
+      - 12
+      4: 3.8.10
+      5: 0.12.9
+      6: 4.16.0.dev0
+      8:
+      - 5
+adafactor:
+  desc: null
+  value: false
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.98
+adam_epsilon:
+  desc: null
+  value: 1.0e-06
+cache_dir:
+  desc: null
+  value: null
+config_name:
+  desc: null
+  value: ./
+dataset_config_name:
+  desc: null
+  value: null
+dataset_name:
+  desc: null
+  value: NbAiLab/NCC
+do_eval:
+  desc: null
+  value: true
+do_train:
+  desc: null
+  value: true
+dtype:
+  desc: null
+  value: bfloat16
+eval_steps:
+  desc: null
+  value: 1000
+hub_model_id:
+  desc: null
+  value: null
+hub_token:
+  desc: null
+  value: null
+learning_rate:
+  desc: null
+  value: 0.0006
+line_by_line:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 1000
+max_seq_length:
+  desc: null
+  value: 512
+mlm_probability:
+  desc: null
+  value: 0.15
+model_name_or_path:
+  desc: null
+  value: ./
+model_type:
+  desc: null
+  value: roberta
+num_train_epochs:
+  desc: null
+  value: 3.0
+output_dir:
+  desc: null
+  value: ./
+overwrite_cache:
+  desc: null
+  value: false
+overwrite_output_dir:
+  desc: null
+  value: true
+pad_to_max_length:
+  desc: null
+  value: true
+per_device_eval_batch_size:
+  desc: null
+  value: 46
+per_device_train_batch_size:
+  desc: null
+  value: 46
+preprocessing_num_workers:
+  desc: null
+  value: null
+push_to_hub:
+  desc: null
+  value: true
+save_steps:
+  desc: null
+  value: 1000
+seed:
+  desc: null
+  value: 42
+tokenizer_name:
+  desc: null
+  value: ./
+train_file:
+  desc: null
+  value: null
+train_ref_file:
+  desc: null
+  value: null
+use_fast_tokenizer:
+  desc: null
+  value: true
+validation_file:
+  desc: null
+  value: null
+validation_ref_file:
+  desc: null
+  value: null
+validation_split_percentage:
+  desc: null
+  value: 5
+warmup_steps:
+  desc: null
+  value: 1000
+weight_decay:
+  desc: null
+  value: 0.01