from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Each record in nova1_data.json is expected to carry a "text" field.
dataset = load_dataset("json", data_files="nova1_data.json")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token, so reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

tokenized_dataset = dataset["train"].map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

# Causal-LM collator (mlm=False) copies input_ids into labels at batch time;
# without it the Trainer receives no labels and cannot compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./nova1_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    evaluation_strategy="no",  # newer transformers versions name this eval_strategy
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("./nova1_model")
tokenizer.save_pretrained("./nova1_model")
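
# --- Optional sanity check (a minimal sketch, not part of the original
# script): reload the checkpoint saved above and generate a short sample.
# The prompt string "Hello" is a placeholder assumption.
from transformers import GPT2LMHeadModel as _Model, GPT2Tokenizer as _Tok

_tokenizer = _Tok.from_pretrained("./nova1_model")
_model = _Model.from_pretrained("./nova1_model")
_model.eval()

_inputs = _tokenizer("Hello", return_tensors="pt")
_outputs = _model.generate(
    **_inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.9,
    pad_token_id=_tokenizer.eos_token_id,  # silence the missing-pad-token warning
)
print(_tokenizer.decode(_outputs[0], skip_special_tokens=True))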