from transformers import (
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

MODEL_DIR = "./68h"

tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_DIR)
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)

# GPT-2 ships without a padding token; reuse EOS so padding="max_length" works.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

dataset = load_dataset("text", data_files={"train": "html_text_dataset.txt"})

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

# Causal-LM collator (mlm=False) copies input_ids into labels so the Trainer
# can compute a loss; without it, train() raises because no labels are present.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="./out",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=500,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
)

trainer.train()
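
# --- Optional follow-up (a sketch, not part of the original script) ---
# trainer.train() only writes periodic checkpoints under ./out; it does not
# export a final model. The lines below are one way to save the result and
# smoke-test generation. The save path and the "<html><body>" prompt are
# illustrative assumptions, not values from the original.
trainer.save_model("./out/final")
tokenizer.save_pretrained("./out/final")

inputs = tokenizer("<html><body>", return_tensors="pt").to(model.device)
generated = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(generated[0], skip_special_tokens=True))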