from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Specify the model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Create a dataset from your text file
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="my_text_file.txt",
    block_size=128,
)

# Create a data collator (mlm=False means causal language modeling:
# the labels are the input ids, shifted by one inside the model)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Specify the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# Create a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Train the model
trainer.train()
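
Note that `TextDataset` is deprecated in recent `transformers` releases; it still runs but emits a deprecation warning. Below is a minimal sketch of the same pipeline built with the Hugging Face `datasets` library instead, assuming `datasets` is installed and reusing `tokenizer`, `data_collator`, `model`, and `training_args` from above. The `group_texts` helper is illustrative: it concatenates all token ids and re-splits them into fixed-size blocks, mirroring `block_size=128` in the `TextDataset` call.

from datasets import load_dataset

# load_dataset("text", ...) yields a "train" split with one example per line
raw_dataset = load_dataset("text", data_files="my_text_file.txt")

# Tokenize every line and drop the raw "text" column
def tokenize(batch):
    return tokenizer(batch["text"])

tokenized = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])

block_size = 128

# Concatenate all token ids, then chop them into equal-length blocks;
# any remainder shorter than block_size is discarded
def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    return {
        k: [v[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, v in concatenated.items()
    }

lm_dataset = tokenized.map(group_texts, batched=True)

# Train exactly as before, just with the new dataset
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
)
trainer.train()

Because every block has the same length, the collator never needs to pad, so GPT-2's missing pad token is not a problem here.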