from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import os

# Paths relative to this script so you can run from any cwd
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# train.jsonl holds one JSON object per line with a "text" field,
# e.g. {"text": "..."}
DATA_FILE = os.path.join(SCRIPT_DIR, "train.jsonl")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Ensure a pad token exists for batching; harmless if the tokenizer already
# defines one (BLOOM's ships a <pad> token), required for models that don't.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

dataset = load_dataset("json", data_files={"train": DATA_FILE}, split="train")


def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=512,
    )


# Drop the raw text columns so the Trainer only sees token ids / attention masks
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
)

# mlm=False selects the causal-LM objective; the collator pads each batch
# and builds the shifted labels for next-token prediction
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    learning_rate=2e-5,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
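
# --- Optional smoke test: a minimal sketch of loading the saved model and
# generating from it, assuming the training run above completed and the
# weights landed in OUTPUT_DIR. The prompt string is a placeholder, not
# taken from the training data. ---
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model=OUTPUT_DIR,
    tokenizer=OUTPUT_DIR,
    device=0 if torch.cuda.is_available() else -1,
)
print(generator("Example prompt:", max_new_tokens=50)[0]["generated_text"])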