from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the training examples; each JSON record is expected to have a "text" field.
dataset = load_dataset("json", data_files="nova1_data.json")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS so padded batches work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")


def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


tokenized_dataset = dataset["train"].map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

# Causal-LM collator (mlm=False) copies input_ids into labels so the model
# returns a loss; without labels, Trainer.train() fails because
# GPT2LMHeadModel produces no loss to backpropagate.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./nova1_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    evaluation_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# Persist both the model weights and the tokenizer so the output
# directory is a self-contained, loadable checkpoint.
trainer.save_model("./nova1_model")
tokenizer.save_pretrained("./nova1_model")
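
# Optional sanity check (a sketch, not part of the original script): reload
# the saved checkpoint from "./nova1_model" and sample a completion to verify
# the artifacts on disk. The prompt string and generation parameters below
# are illustrative placeholders, not values from the training setup.
check_tokenizer = GPT2Tokenizer.from_pretrained("./nova1_model")
check_model = GPT2LMHeadModel.from_pretrained("./nova1_model")

inputs = check_tokenizer("Hello, Nova.", return_tensors="pt")
outputs = check_model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    pad_token_id=check_tokenizer.eos_token_id,
)
print(check_tokenizer.decode(outputs[0], skip_special_tokens=True))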