# Fine-tune a causal language model on a JSONL corpus of multilingual documents.
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import os

# Resolve paths relative to this script so it can be run from any directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE = os.path.join(SCRIPT_DIR, "train.jsonl")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

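# Expected input format: one JSON object per line in train.jsonl, each carrying
# a "text" field (that is what the tokenize() step below reads). A hypothetical
# example line:
#
#   {"text": "La documentation est disponible en plusieurs langues."}
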
# BLOOM-560m: a small multilingual causal LM, practical to fine-tune on a single GPU.
model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Some causal LM tokenizers define no pad token; fall back to EOS so the
# collator can pad batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)

# Keep the model config in sync with the tokenizer's padding setup.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Load the raw JSONL corpus; each record must carry a "text" field.
dataset = load_dataset("json", data_files={"train": DATA_FILE}, split="train")

def tokenize(batch):
    # Truncate long documents to the 512-token training window.
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=512,
    )

# Map tokenization over the corpus; batched=True tokenizes many records per
# call, and remove_columns drops the raw "text" so only model inputs remain.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
)

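# Optional sanity check, a minimal sketch that can be deleted without affecting
# training: confirm the mapped dataset exposes token ids as expected.
sample = tokenized_dataset[0]
print("Tokenized columns:", list(sample.keys()))  # e.g. input_ids, attention_mask
print("First tokens:", sample["input_ids"][:10])
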
# mlm=False selects causal (next-token) language modeling: the collator pads
# each batch dynamically and builds labels from input_ids, with pad positions
# set to -100 so the loss ignores them.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    learning_rate=2e-5,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train; to resume an interrupted run from the latest checkpoint under
# OUTPUT_DIR, call trainer.train(resume_from_checkpoint=True) instead.
trainer.train()

# Persist the fine-tuned weights and tokenizer together so OUTPUT_DIR can be
# reloaded later with from_pretrained().
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
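
# A minimal usage sketch (an addition, not part of the original pipeline):
# reload the saved artifacts and generate a short continuation as a smoke
# test. The prompt string is purely illustrative.
ft_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
ft_model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
ft_model.eval()

prompt = "The documentation covers"
inputs = ft_tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = ft_model.generate(**inputs, max_new_tokens=40)
print(ft_tokenizer.decode(output_ids[0], skip_special_tokens=True))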