File size: 744 Bytes
62dbf75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Checkpoint name handed to both from_pretrained() calls below, so the
# tokenizer and the model weights always come from the same source.
MODEL = "skt/kogpt2-base-v2"

# Tokenizer and causal language model loaded from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)

# Training corpus read from data.txt through the "text" dataset loader.
# Later code in this script relies on the result having a "train" split
# and a "text" column.
ds = load_dataset("text", data_files="data.txt")

def tok(x):
    """Tokenize the "text" entry of *x*, truncating to at most 128 tokens.

    This script passes the function to ``ds.map(..., batched=True)``, so
    ``x["text"]`` is a batch of strings rather than a single string.

    No padding is applied here; the tokenizer's output is returned as-is.
    """
    texts = x["text"]
    encoded = tokenizer(texts, max_length=128, truncation=True)
    return encoded

# Tokenize the corpus and drop the raw "text" column so that only the
# tokenizer's output columns are left for the Trainer.
ds = ds.map(tok, batched=True, remove_columns=["text"])

# Blank lines in data.txt tokenize to zero tokens; such samples carry no
# training signal and can produce degenerate (empty) batches, so drop them.
ds = ds.filter(lambda example: len(example["input_ids"]) > 0)

# Sequences have different lengths, so batching requires padding, and
# padding requires a pad token. GPT-2 style tokenizers frequently do not
# define one; reuse EOS in that case (padded positions are excluded from
# the loss by the collator below).
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects the causal-LM behaviour: each batch is padded to a common
# length and `labels` is built as a copy of `input_ids` with padding masked
# out. Without this the dataset has no `labels` column and the model cannot
# return a loss for the Trainer to optimise.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="out",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_steps=20,
    save_steps=500,
    report_to="none",
)

Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    data_collator=collator,
).train()

# Persist the fine-tuned weights together with the tokenizer so that "out"
# can be loaded directly with from_pretrained().
model.save_pretrained("out")
tokenizer.save_pretrained("out")