from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Pretrained Korean GPT-2 checkpoint that gets fine-tuned below.
MODEL = "skt/kogpt2-base-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)

# Plain-text corpus; the "text" builder yields one example per line,
# under a single "train" split with a "text" column.
ds = load_dataset("text", data_files="data.txt")
def tok(batch):
    """Tokenize a batch of raw lines, truncating each to 128 tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=128)
# Tokenize the corpus; drop the raw "text" column so batches contain only
# token fields (input_ids / attention_mask).
ds = ds.map(tok, batched=True, remove_columns=["text"])

# GPT-2-style tokenizers typically ship without a pad token, and the
# collator below needs one to batch variable-length sequences.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# BUG FIX: the original passed no data_collator, so batches carried no
# "labels" and Trainer could not compute a loss for causal-LM training.
# mlm=False = causal-LM collation: pad the batch and copy input_ids into
# "labels" (pad positions masked with -100).
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="out",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_steps=20,
    save_steps=500,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    data_collator=collator,
)
trainer.train()

# Save weights and tokenizer together so "out" is directly loadable
# with from_pretrained().
model.save_pretrained("out")
tokenizer.save_pretrained("out")