training the tokenizer

In [4]:
!pip install git+https://github.com/huggingface/transformers 
!pip install torch
!pip list | grep -E 'transformers|tokenizers'

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-jy7qmcjn
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-jy7qmcjn
  Resolved https://github.com/huggingface/transformers to commit 30992ef0d911bdeca425969d210771118a5cd1ac
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting regex!=2019.12.17
  Downloading regex-2022.8.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (752 kB)
[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 752 kB 2.0 MB/s eta 0:00:01
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.22.0.dev0-py3-none-any.whl size=4730003 sha256=82f934e

In [5]:
%%time 
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./data/test").glob("*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 1min 4s, sys: 1min 2s, total: 2min 6s
Wall time: 17.4 s


In [6]:
!mkdir rICE
tokenizer.save_model("rICE")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
mkdir: cannot create directory â€˜rICEâ€™: File exists


['rICE/vocab.json', 'rICE/merges.txt']

In [7]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./rICE/vocab.json",
    "./rICE/merges.txt",
)

In [8]:
tokenizer.encode("hundurinn minn").tokens

['h', 'undurinn', 'Ä minn']

Train the language model

In [9]:
# Check that we have a GPU
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Aug 21 17:31:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 516.94       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
|  0%   33C    P8    16W / 250W |   1423MiB /  8192MiB |     18%      Default |
|                               |            

In [11]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [12]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,

    # num_attention_heads=6,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]


In [13]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./rICE", max_len=512)

In [14]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [15]:
model.num_parameters()
# => 83 million parameters

83504416

In [16]:
from torch.utils.data import Dataset
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

class IcelandicDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./rICE/vocab.json",
            "./rICE/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=128)
        # tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/test").glob("*.txt") if evaluate else Path("./data/train").glob("*.txt")
        for src_file in src_files:
            print("ğŸ”¥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # Weâ€™ll pad at the batch level.
        return torch.tensor(self.examples[i])


dataset = IcelandicDataset()

ğŸ”¥ data/train/41170_news.txt
ğŸ”¥ data/train/20077_news.txt
ğŸ”¥ data/train/1980_news.txt
ğŸ”¥ data/train/31417_news.txt
ğŸ”¥ data/train/16586_news.txt
ğŸ”¥ data/train/20125_news.txt
ğŸ”¥ data/train/33153_news.txt
ğŸ”¥ data/train/24792_news.txt
ğŸ”¥ data/train/3405_news.txt
ğŸ”¥ data/train/19859_news.txt
ğŸ”¥ data/train/5222_news.txt
ğŸ”¥ data/train/33356_news.txt
ğŸ”¥ data/train/9107_news.txt
ğŸ”¥ data/train/26357_news.txt
ğŸ”¥ data/train/20623_news.txt
ğŸ”¥ data/train/543_news.txt
ğŸ”¥ data/train/34212_news.txt
ğŸ”¥ data/train/7773_news.txt
ğŸ”¥ data/train/24625_news.txt
ğŸ”¥ data/train/42520_news.txt
ğŸ”¥ data/train/36850_news.txt
ğŸ”¥ data/train/21595_news.txt
ğŸ”¥ data/train/13719_news.txt
ğŸ”¥ data/train/19161_news.txt
ğŸ”¥ data/train/39148_news.txt
ğŸ”¥ data/train/36384_news.txt
ğŸ”¥ data/train/45412_news.txt
ğŸ”¥ data/train/4379_news.txt
ğŸ”¥ data/train/31087_news.txt
ğŸ”¥ data/train/12715_news.txt
ğŸ”¥ data/train/13122_news.txt
ğŸ”¥ data/train/31550_news.txt
ğŸ”¥ data/train/

In [17]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./rICE",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [19]:
%%time
trainer.train()

***** Running training *****
  Num examples = 415062
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 103766


Step,Training Loss
500,8.65
1000,8.2084
1500,8.1327
2000,7.9749
2500,7.9396
3000,7.9153
3500,7.9056
4000,7.8028
4500,7.7427
5000,7.7145


Saving model checkpoint to ./rICE/checkpoint-10000
Configuration saved in ./rICE/checkpoint-10000/config.json
Model weights saved in ./rICE/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to ./rICE/checkpoint-20000
Configuration saved in ./rICE/checkpoint-20000/config.json
Model weights saved in ./rICE/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to ./rICE/checkpoint-30000
Configuration saved in ./rICE/checkpoint-30000/config.json
Model weights saved in ./rICE/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [rICE/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to ./rICE/checkpoint-40000
Configuration saved in ./rICE/checkpoint-40000/config.json
Model weights saved in ./rICE/checkpoint-40000/pytorch_model.bin
Deleting older checkpoint [rICE/checkpoint-20000] due to args.save_total_limit
Saving model checkpoint to ./rICE/checkpoint-50000
Configuration saved in ./rICE/checkpoint-50000/config.json
Model weights saved in ./rICE/ch

CPU times: user 1h 20min 36s, sys: 5min 57s, total: 1h 26min 33s
Wall time: 1h 26min 15s


TrainOutput(global_step=103766, training_loss=6.387869066765909, metrics={'train_runtime': 5175.7854, 'train_samples_per_second': 80.193, 'train_steps_per_second': 20.048, 'total_flos': 8553983519672448.0, 'train_loss': 6.387869066765909, 'epoch': 1.0})

In [20]:
trainer.save_model('./rICE')

Saving model checkpoint to ./rICE
Configuration saved in ./rICE/config.json
Model weights saved in ./rICE/pytorch_model.bin


using the model

In [21]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./rICE",
    tokenizer="./rICE"
)

loading configuration file ./rICE/config.json
Model config RobertaConfig {
  "_name_or_path": "./rICE",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./rICE/config.json
Model config RobertaConfig {
  "_name_or_path": "./rICE",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_toke

In [22]:
fill_mask("Ã‰g heiti Aron og elska aÃ° borÃ°a <mask> og epli'")


[{'score': 0.10871271789073944,
  'token': 684,
  'token_str': ' eins',
  'sequence': "Ã‰g heiti Aron og elska aÃ° borÃ°a eins og epli'"},
 {'score': 0.01858321763575077,
  'token': 650,
  'token_str': ' vel',
  'sequence': "Ã‰g heiti Aron og elska aÃ° borÃ°a vel og epli'"},
 {'score': 0.010185176506638527,
  'token': 322,
  'token_str': ' viÃ°',
  'sequence': "Ã‰g heiti Aron og elska aÃ° borÃ°a viÃ° og epli'"},
 {'score': 0.007654798217117786,
  'token': 980,
  'token_str': ' Ã¡fram',
  'sequence': "Ã‰g heiti Aron og elska aÃ° borÃ°a Ã¡fram og epli'"},
 {'score': 0.007473105099052191,
  'token': 1289,
  'token_str': ' heima',
  'sequence': "Ã‰g heiti Aron og elska aÃ° borÃ°a heima og epli'"}]

In [None]:
!zip -r /content/file.zip /content/rICE/

In [None]:
# from google.colab import files
# files.download("/content/file.zip")