Init
Browse files- prepare.py +63 -0
- preprocess.py +73 -0
- run_cat.py +52 -0
- run_mlm_local.py +130 -0
prepare.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
#
|
| 3 |
+
# 1. [Prepare the dataset](#1-prepare-the-dataset)
|
| 4 |
+
# 2. [Train a Tokenizer](#2-train-a-tokenizer)
|
| 5 |
+
# 3. [Preprocess the dataset](#3-preprocess-the-dataset)
|
| 6 |
+
# 4. [Pre-train BERT on Habana Gaudi](#4-pre-train-bert-on-habana-gaudi)
|
| 7 |
+
#
|
| 8 |
+
# _Note: Steps 1 to 3 can/should be run on a different instance type, since they are CPU-intensive tasks._
|
| 9 |
+
|
| 10 |
+
# %%
|
| 11 |
+
# ## 1. Prepare the dataset
|
| 12 |
+
# Log into the [Hugging Face Hub](https://huggingface.co/models) to push our dataset, tokenizer, model artifacts, logs and metrics during training and afterwards to the hub.
|
| 13 |
+
|
| 14 |
+
from huggingface_hub import HfApi

# Resolve the logged-in Hub account; requires a prior `huggingface-cli login`.
api = HfApi()
user_id = api.whoami()["name"]
print(f"user id '{user_id}' will be used during the example")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
from datasets import concatenate_datasets, load_dataset

# The original BERT (https://arxiv.org/abs/1810.04805) was pretrained on the
# Wikipedia and BookCorpus datasets; both are available on the Hugging Face Hub.
# Note: for Wikipedia we use the `20220301` snapshot, which differs from the
# original split. As a first step we load both datasets and merge them into
# one big training corpus, keeping only the raw text.
bookcorpus = load_dataset("bookcorpus", split="train")
wiki = load_dataset("wikipedia", "20220301.en", split="train")
non_text_columns = [col for col in wiki.column_names if col != "text"]
wiki = wiki.remove_columns(non_text_columns)  # only keep the 'text' column

# Both datasets must share the same feature schema before concatenation.
assert bookcorpus.features.type == wiki.features.type
raw_datasets = concatenate_datasets([bookcorpus, wiki])
print(raw_datasets)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# %% [markdown]
|
| 36 |
+
# > We are not going to do some advanced dataset preparation, like de-duplication, filtering or any other pre-processing. If you are planning to apply this notebook to train your own BERT model from scratch I highly recommend to including those data preparation steps into your workflow. This will help you improve your Language Model.
|
| 37 |
+
|
| 38 |
+
# ## 2. Train a Tokenizer
|
| 39 |
+
#
|
| 40 |
+
# To be able to train our model we need to convert our text into a tokenized format. Most Transformer models are coming with a pre-trained tokenizer, but since we are pre-training our model from scratch we also need to train a Tokenizer on our data. We can train a tokenizer on our data with `transformers` and the `BertTokenizerFast` class.
|
| 41 |
+
#
|
| 42 |
+
# More information about training a new tokenizer can be found in our [Hugging Face Course](https://huggingface.co/course/chapter6/2?fw=pt).
|
| 43 |
+
from tqdm import tqdm
|
| 44 |
+
from transformers import BertTokenizerFast
|
| 45 |
+
|
| 46 |
+
# repositor id for saving the tokenizer
|
| 47 |
+
tokenizer_id="chaoyan/bert-base-uncased-cat"
|
| 48 |
+
|
| 49 |
+
# create a python generator to dynamically load the data
|
| 50 |
+
def batch_iterator(batch_size=10000):
    """Yield successive batches of raw text for tokenizer training.

    Reads the module-level `raw_datasets` in slices of `batch_size` rows and
    yields only the 'text' column, with a tqdm progress bar over the slices.
    """
    total = len(raw_datasets)
    for start in tqdm(range(0, total, batch_size)):
        batch = raw_datasets[start : start + batch_size]
        yield batch["text"]
|
| 53 |
+
|
| 54 |
+
# Train a new 32k-token vocabulary on our corpus, starting from the
# bert-base-uncased tokenizer configuration (lowercasing, special tokens, ...).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
bert_tokenizer = tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=32_000)

# Bug fix: inspect the newly trained tokenizer, not the pre-trained base one.
print(bert_tokenizer)

# Save locally so the preprocessing step can load it from disk.
bert_tokenizer.save_pretrained("cat_tokenizer")

# We push the tokenizer to the Hugging Face Hub for later training our model.
bert_tokenizer.push_to_hub(tokenizer_id)
|
| 63 |
+
|
preprocess.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import concatenate_datasets, load_dataset

# Load BookCorpus and English Wikipedia (snapshot `20220301`, different from
# the split used by the original BERT paper) and merge them into a single
# training corpus containing only the raw text.
bookcorpus = load_dataset("bookcorpus", split="train")
wiki = load_dataset("wikipedia", "20220301.en", split="train")
drop_cols = [col for col in wiki.column_names if col != "text"]
wiki = wiki.remove_columns(drop_cols)  # only keep the 'text' column

# Schemas must match before concatenation.
assert bookcorpus.features.type == wiki.features.type
raw_datasets = concatenate_datasets([bookcorpus, wiki])
print(raw_datasets)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# %%
|
| 17 |
+
# ## 3. Preprocess the dataset
|
| 18 |
+
#
|
| 19 |
+
# Before we can get started with training our model, the last step is to pre-process/tokenize our dataset. We will use our trained tokenizer to tokenize our dataset and then push it to hub to load it easily later in our training. The tokenization process is also kept pretty simple, if documents are longer than `512` tokens those are truncated and not split into several documents.
|
| 20 |
+
|
| 21 |
+
from transformers import AutoTokenizer
import multiprocessing

# Load the tokenizer trained in step 2 from the local directory.
# (Alternatively, pull it from the Hub: AutoTokenizer.from_pretrained(f"{user_id}/{tokenizer_id}"))
tokenizer = AutoTokenizer.from_pretrained("cat_tokenizer")

# Cap the worker count at 8 to avoid oversubscribing smaller machines.
num_proc = min(8, multiprocessing.cpu_count())
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")
|
| 29 |
+
|
| 30 |
+
def group_texts(examples):
    """Tokenize a batch of raw text, truncating to the tokenizer's max length.

    Documents longer than the model max length are truncated, not split.
    NOTE(review): relies on the module-level `tokenizer` loaded above.
    """
    return tokenizer(
        examples["text"],
        return_special_tokens_mask=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )
|
| 35 |
+
|
| 36 |
+
# Tokenize the whole corpus in parallel and drop the raw text column.
tokenized_datasets = raw_datasets.map(
    group_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)
print(tokenized_datasets.features)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# As data processing function will we concatenate all texts from our dataset and generate chunks of `tokenizer.model_max_length` (512).
|
| 42 |
+
from itertools import chain
|
| 43 |
+
|
| 44 |
+
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
| 45 |
+
# max_seq_length.
|
| 46 |
+
# Main data processing function that will concatenate all texts from our
# dataset and generate chunks of `max_length` tokens.
def group_texts(examples, max_length=None):
    """Concatenate every feature list in a batch and re-split into fixed-size chunks.

    Args:
        examples: batch mapping feature name -> list of per-document token lists.
        max_length: chunk size in tokens; defaults to ``tokenizer.model_max_length``
            (backward compatible with the original zero-argument call).

    Returns:
        Mapping with the same keys where each value is a list of chunks of
        ``max_length`` tokens. The trailing remainder is dropped, except when
        the whole batch is shorter than ``max_length`` (then the single short
        chunk is kept, matching the original behavior).
    """
    if max_length is None:
        max_length = tokenizer.model_max_length
    # Concatenate all texts. chain.from_iterable avoids unpacking a huge *args.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples}
    total_length = len(concatenated_examples[next(iter(examples))])
    # We drop the small remainder; we could add padding instead if the model
    # supported it — customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_length.
    return {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
|
| 60 |
+
|
| 61 |
+
# Re-chunk into fixed-length sequences, then shuffle deterministically.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=num_proc).shuffle(seed=34)

print(tokenized_datasets)
print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")
# the dataset contains in total 3417216000 tokens
# tokenized_datasets.to_csv('processed_bert_dataset.csv')

# push dataset to hugging face
user_id = 'chaoyan'
dataset_id = f"{user_id}/processed_bert_dataset"
tokenized_datasets.push_to_hub(dataset_id)
|
run_cat.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# %%
|
| 3 |
+
# ## 4. Pre-train BERT on processed dataset
|
| 4 |
+
|
| 5 |
+
import os
import subprocess

from huggingface_hub import HfFolder

# Hyperparameters forwarded to run_mlm_local.py as command-line flags.
hyperparameters = {
    "model_config_id": "bert-base-uncased",
    "dataset_id": "chaoyan/processed_bert_dataset",
    "tokenizer_id": "cat_tokenizer",
    "repository_id": "bert-base-uncased-cat",
    "hf_hub_token": HfFolder.get_token(),  # need to be logged in with `huggingface-cli login`
    "max_steps": 100_000,
    "per_device_train_batch_size": 16,
    "learning_rate": 5e-5,
}

# Build an explicit argv list and run without a shell: this avoids the quoting
# pitfalls of os.system(f"..."), and check=True makes a failed training run
# raise instead of being silently ignored. Entries whose value is None (e.g.
# no cached Hub token) are skipped so the training script's own defaults apply
# instead of receiving the literal string "None".
cmd = ["python3", "run_mlm_local.py"]
for key, value in hyperparameters.items():
    if value is not None:
        cmd.extend([f"--{key}", str(value)])
subprocess.run(cmd, check=True)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# %% [markdown]
|
| 26 |
+
# 
|
| 27 |
+
# _This [experiment](https://huggingface.co/philschmid/bert-base-uncased-2022-habana-test-6) ran for 60k steps_
|
| 28 |
+
#
|
| 29 |
+
# In our `hyperparameters` we defined a `max_steps` property, which limited the pre-training to only `100_000` steps. The `100_000` steps with a global batch size of `256` took around 12,5 hour.
|
| 30 |
+
#
|
| 31 |
+
# BERT was originally pre-trained on [1 Million Steps](https://arxiv.org/pdf/1810.04805.pdf) with a global batch size of `256`:
|
| 32 |
+
# > We train with batch size of 256 sequences (256 sequences * 512 tokens = 128,000 tokens/batch) for 1,000,000 steps, which is approximately 40 epochs over the 3.3 billion word corpus.
|
| 33 |
+
#
|
| 34 |
+
# Meaning if we want to do a full pre-training it would take around 125 hours (12.5 hours * 10) and would cost us around ~$1,650 using Habana Gaudi on AWS, which is extremely cheap.
|
| 35 |
+
#
|
| 36 |
+
# For comparison the DeepSpeed Team, who holds the record for the [fastest BERT-pretraining](https://www.deepspeed.ai/tutorials/bert-pretraining/) [reported](https://www.deepspeed.ai/tutorials/bert-pretraining/) that pre-training BERT on 1 [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) (powered by 16 NVIDIA V100 GPUs with 32GB of memory each) takes around 33,25 hours.
|
| 37 |
+
#
|
| 38 |
+
# To be able to compare the cost we can use the [p3dn.24xlarge](https://aws.amazon.com/de/ec2/instance-types/p3/) as reference, which comes with 8x NVIDIA V100 32GB GPUs and costs ~31,22$/h. We would need two of these instances to have the same "setup" as the one DeepSpeed reported, for now we are ignoring any overhead created to the multi-node setup (I/O, Network etc.).
|
| 39 |
+
# This would bring the cost of the DeepSpeed GPU based training on AWS to around ~$2,075, which is 25% more than what Habana Gaudi currently delivers.
|
| 40 |
+
# _Something to note here is that using [DeepSpeed](https://www.deepspeed.ai/tutorials/bert-pretraining/#deepspeed-single-gpu-throughput-results) in general improves the performance by a factor of ~2._
|
| 41 |
+
#
|
| 42 |
+
# We are looking forward on re-doing the experiment once the [Gaudi DeepSpeed integration](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide.html#deepspeed-configs) is more widely available.
|
| 43 |
+
#
|
| 44 |
+
#
|
| 45 |
+
# ## Conclusion
|
| 46 |
+
#
|
| 47 |
+
# That's it for this tutorial. Now you know the basics on how to pre-train BERT from scratch using Hugging Face Transformers and Habana Gaudi. You also saw how easy it is to migrate from the `Trainer` to the `GaudiTrainer`.
|
| 48 |
+
#
|
| 49 |
+
# We compared our implementation with the [fastest BERT-pretraining](https://www.deepspeed.ai/tutorials/bert-pretraining/) results and saw that Habana Gaudi still delivers a 25% cost reduction and allows us to pre-train BERT for ~$1,650.
|
| 50 |
+
#
|
| 51 |
+
# Those results are incredible, since it will allow companies to adapt their pre-trained models to their language and domain to [improve accuracy up to 10%](https://huggingface.co/pile-of-law/legalbert-large-1.7M-1#evaluation-results) compared to the general BERT models.
|
| 52 |
+
#
|
run_mlm_local.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import sys
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
from transformers import (
|
| 8 |
+
HfArgumentParser,
|
| 9 |
+
AutoModelForMaskedLM,
|
| 10 |
+
AutoTokenizer,
|
| 11 |
+
set_seed,
|
| 12 |
+
AutoConfig,
|
| 13 |
+
DataCollatorForLanguageModeling,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
from transformers import Trainer, TrainingArguments
|
| 17 |
+
from datasets import load_dataset
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Route all log records to stdout with a timestamped, module-tagged format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class ScriptArguments:
    """
    Arguments which aren't included in the TrainingArguments.

    All Hub-related ids default to None and are normally supplied on the
    command line (see run_cat.py).
    """

    # Fixed: these fields were annotated plain `str` while defaulting to
    # None/False; they are now Optional[str] with default None so the
    # annotations match the defaults and HfArgumentParser treats an absent
    # flag as "not provided" rather than the bool False.
    dataset_id: Optional[str] = field(
        default=None, metadata={"help": "The repository id of the dataset to use (via the datasets library)."}
    )
    tokenizer_id: Optional[str] = field(
        default=None, metadata={"help": "The repository id of the tokenizer to use (via AutoTokenizer)."}
    )
    repository_id: Optional[str] = field(
        default=None,
        metadata={"help": "The repository id where the model will be saved or loaded from for further pre-training."},
    )
    hf_hub_token: Optional[str] = field(
        default=None,
        metadata={"help": "The Token used to push models, metrics and logs to the Hub."},
    )
    model_config_id: Optional[str] = field(
        default="bert-base-uncased", metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    per_device_train_batch_size: Optional[int] = field(
        default=16,
        metadata={"help": "The Batch Size per HPU used during training"},
    )
    max_steps: Optional[int] = field(
        default=1_000_000,
        metadata={"help": "The Number of Training steps to perform."},
    )
    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "Learning Rate for the training"})
    mlm_probability: Optional[float] = field(
        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def run_mlm():
    """Pre-train a masked-language model from scratch.

    Parses ``ScriptArguments`` from the command line, loads the processed
    dataset and the trained tokenizer, builds a randomly initialized model
    from the reference config, and runs MLM training with ``Trainer``.
    """
    # Parse arguments
    parser = HfArgumentParser(ScriptArguments)
    script_args = parser.parse_args_into_dataclasses()[0]
    logger.info(f"Script parameters {script_args}")

    # set seed for reproducibility (the same seed is reused in TrainingArguments)
    seed = 34
    set_seed(seed)

    # load processed dataset (pre-tokenized and chunked by preprocess.py)
    train_dataset = load_dataset(script_args.dataset_id, split="train")
    # load trained tokenizer
    tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id, use_auth_token=script_args.hf_hub_token)

    # load model from config (for training from scratch: weights are randomly
    # initialized, nothing is downloaded except the config itself)
    logger.info("Training new model from scratch")
    config = AutoConfig.from_pretrained(script_args.model_config_id)
    model = AutoModelForMaskedLM.from_config(config)

    # The newly trained tokenizer's vocab size can differ from the config's.
    logger.info(f"Resizing token embedding to {len(tokenizer)}")
    model.resize_token_embeddings(len(tokenizer))

    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=script_args.mlm_probability, pad_to_multiple_of=8
    )

    # define our hyperparameters
    training_args = TrainingArguments(
        output_dir=script_args.repository_id,
        per_device_train_batch_size=script_args.per_device_train_batch_size,
        learning_rate=script_args.learning_rate,
        seed=seed,
        max_steps=script_args.max_steps,
        # logging & evaluation strategies
        logging_dir=f"{script_args.repository_id}/logs",
        logging_strategy="steps",
        logging_steps=100,
        save_strategy="steps",
        save_steps=5_000,
        save_total_limit=2,
        report_to="tensorboard",
        # push to hub parameters
        # hub_strategy="every_save",
        # hub_model_id=script_args.repository_id,
        # pretraining
        # NOTE(review): only relevant under DDP; presumably kept for multi-card runs — confirm
        ddp_find_unused_parameters=True,
        # throughput_warmup_steps=2, # !!! ?
    )

    # Initialize our Trainer
    trainer = Trainer(
        args=training_args,
        model=model,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    # train the model
    trainer.train()
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# Script entry point: run pre-training only when executed directly.
if __name__ == "__main__":
    run_mlm()
|