import os

# Set the Hugging Face cache location before importing the libraries,
# since HF_HOME is read when they are first imported
os.environ["HF_HOME"] = "./hf_cache"

from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load dataset from Hugging Face Hub
dataset = load_dataset("drzeeIslam/nelson-gpt-chunks")

# Load tokenizer and model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Tokenize the texts
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Randomly mask 15% of tokens for the masked-language-modeling objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()
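
# --- Optional sanity check: a minimal sketch, not part of the original pipeline ---
# After training, save the model and probe it with the fill-mask pipeline to
# confirm the MLM head adapted to the corpus. The save path "./results/final"
# and the example sentence are assumptions, not defined by the script above.
trainer.save_model("./results/final")
tokenizer.save_pretrained("./results/final")

from transformers import pipeline

fill_mask = pipeline("fill-mask", model="./results/final")

# [MASK] is DistilBERT's mask token; each prediction is a dict with a
# candidate token string and its probability score
for prediction in fill_mask("Fever in infants is often caused by [MASK] infections."):
    print(f"{prediction['token_str']}: {prediction['score']:.3f}")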