File size: 744 Bytes
62dbf75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Hub checkpoint that gets fine-tuned.
MODEL = "skt/kogpt2-base-v2"

# The KoGPT2 model card loads the tokenizer with its special tokens spelled
# out. Without them the tokenizer has no pad token taken from this
# checkpoint's vocabulary, and padding a batch of unequal-length lines fails.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
model = AutoModelForCausalLM.from_pretrained(MODEL)

# Load data.txt through the "text" builder; the examples expose a "text"
# column (one example per line with the builder's default settings).
ds = load_dataset("text", data_files="data.txt")
def tok(x):
    """Tokenize the "text" field of *x*, truncated to at most 128 tokens.

    Intended as the function passed to ``ds.map``. With ``batched=True``,
    ``x["text"]`` is a list of strings. The tokenizer's output is returned
    unchanged.
    """
    return tokenizer(x["text"], truncation=True, max_length=128)
# Drop blank / whitespace-only lines before tokenizing. They carry no training
# signal, and a batch made up only of empty sequences has no target tokens,
# which can turn the loss into NaN.
ds = ds.filter(lambda row: bool(row["text"].strip()))

# Tokenize in batches and drop the raw "text" column so that only the
# tokenizer's outputs remain in the dataset.
ds = ds.map(tok, batched=True, remove_columns=["text"])
# Training configuration handed to the Trainer.
args = TrainingArguments(
    # Output location and checkpoint cadence.
    output_dir="out",
    save_steps=500,
    # Schedule: a single epoch with a per-device batch of two examples.
    num_train_epochs=1,
    per_device_train_batch_size=2,
    # Log every 20 steps; no external reporting integration.
    logging_steps=20,
    report_to="none",
)
# The tokenized examples have different lengths and carry no "labels" column.
# Without a collator the Trainer can neither stack them into a batch nor get a
# loss back from the causal-LM model. DataCollatorForLanguageModeling with
# mlm=False pads each batch and builds labels from input_ids, ignoring the
# padded positions in the loss.
if tokenizer.pad_token is None:
    # Padding needs a pad token. Prefer the id declared in the model config so
    # the pad id refers to a token the model already has an embedding for;
    # fall back to the end-of-sequence token otherwise.
    pad_id = model.config.pad_token_id
    if pad_id is None:
        pad_id = model.config.eos_token_id
    tokenizer.pad_token = (
        tokenizer.eos_token
        if pad_id is None
        else tokenizer.convert_ids_to_tokens(pad_id)
    )

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    data_collator=collator,
)
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so "out" can be
# loaded again with from_pretrained.
model.save_pretrained("out")
tokenizer.save_pretrained("out")
|