from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import os

# Paths relative to this script so you can run from any cwd
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE = os.path.join(SCRIPT_DIR, "train.jsonl")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")
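
# bloom-560m is the smallest BLOOM checkpoint: a multilingual causal LM that
# fine-tunes comfortably on a single GPU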
model_id = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# BLOOM has no pad_token by default; required for batching
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id)
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id
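
# train.jsonl: one JSON object per line with a "text" field,
# e.g. {"text": "Some document text."}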
dataset = load_dataset("json", data_files={"train": DATA_FILE}, split="train")
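
# Truncate each document to 512 tokens; padding is deferred to the collator,
# which pads every batch dynamically to its longest sequence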
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
    )

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,  # the tokenizer accepts a list of texts, so batched mapping is faster
    remove_columns=dataset.column_names,
    desc="Tokenizing",
)
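
# mlm=False produces causal-LM labels: a copy of input_ids with pad positions
# set to -100 so they are ignored by the loss (the model shifts labels internally)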
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
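
# Conservative hyperparameters for a small fine-tune; raise
# per_device_train_batch_size or add gradient_accumulation_steps for a larger
# effective batch if memory allows. fp16 is used only when a CUDA GPU is present.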
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    learning_rate=2e-5,
    fp16=torch.cuda.is_available(),
)
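
# No eval_dataset is passed, so the run trains without periodic evaluation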
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
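
# Run fine-tuning; intermediate checkpoints land in OUTPUT_DIR/checkpoint-<step>
# every save_steps (500) steps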
trainer.train()
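
# Save final weights and tokenizer together so the directory can be reloaded
# with AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)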
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)