from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Load the training corpus from a local JSON file.
dataset = load_dataset("json", data_files="nova1_data.json")
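# Assumed record layout (the data file is not shown here): each JSON entry
# carries a "text" field, the only column tokenize_function reads, e.g.:
#   {"text": "Example training sentence for Nova1."}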
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token, so reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")
def tokenize_function(examples):
    # Pad or truncate every example to a fixed 128-token window.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

tokenized_dataset = dataset["train"].map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
training_args = TrainingArguments(
    output_dir="./nova1_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    evaluation_strategy="no",  # no eval split is supplied to the Trainer
    report_to="none",
)
# A causal-LM collator (mlm=False) copies input_ids into labels at batch time;
# without it the Trainer receives no labels and cannot compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save model weights and tokenizer files side by side for reloading.
trainer.save_model("./nova1_model")
tokenizer.save_pretrained("./nova1_model")
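# --- Optional smoke test (a minimal sketch; the prompt and generation
# settings below are illustrative assumptions, not part of the recipe) ---
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./nova1_model",
    tokenizer="./nova1_model",
)
print(generator("Hello, Nova1:", max_new_tokens=40)[0]["generated_text"])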