import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
class TextDataset(Dataset):
    def __init__(self, text, tokenizer, block_size=1024):
        self.examples = []
        # GPT-Neo accepts up to 2048 tokens of context; 1024 keeps memory usage lower.
        # Note: chunking by characters is crude (1024 characters is far fewer than
        # 1024 tokens), but it preserves the original script's behavior.
        for i in range(0, len(text), block_size):
            inputs = tokenizer(text[i:i + block_size], truncation=True, max_length=block_size,
                               padding="max_length", return_tensors="pt")
            input_ids = inputs["input_ids"].squeeze(0)
            attention_mask = inputs["attention_mask"].squeeze(0)
            # For causal LM training the labels are the inputs themselves;
            # padding positions are set to -100 so the loss ignores them.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100
            self.examples.append({"input_ids": input_ids,
                                  "attention_mask": attention_mask,
                                  "labels": labels})

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]
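# Illustrative note (an addition, not in the original script): each dataset item is a
# dict of three 1024-long tensors, e.g.
#   TextDataset("some long text ...", tokenizer)[0]["input_ids"].shape == torch.Size([1024])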
class GPTNeoTrainer:
    def __init__(self, model_name, dataset_path):
        self.model = GPTNeoForCausalLM.from_pretrained(model_name)
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # The GPT-2 tokenizer has no pad token by default; reuse the EOS token
        # so that padding="max_length" in TextDataset works.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        with open(dataset_path, "r", encoding="utf-8") as f:
            data = f.read()
        self.dataset = TextDataset(data, self.tokenizer)
        self.training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=10,
            per_device_train_batch_size=16,  # reduce this if the 1.3B model does not fit in GPU memory
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir="./logs",
        )

    def train(self):
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.dataset,
        )
        trainer.train()

    def save_model(self, output_dir):
        self.model.save_pretrained(output_dir)
        # Save the tokenizer alongside the model so the output directory is self-contained.
        self.tokenizer.save_pretrained(output_dir)
# Using the class
trainer = GPTNeoTrainer("EleutherAI/gpt-neo-1.3B", "dataset.txt")
trainer.train()
trainer.save_model("model_directory")
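
# Minimal inference sketch (an addition, not part of the original script): it assumes
# save_model() above has written both the model and tokenizer to "model_directory";
# the prompt string and sampling parameters are illustrative.
model = GPTNeoForCausalLM.from_pretrained("model_directory")
tokenizer = GPT2Tokenizer.from_pretrained("model_directory")
input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_length=50, do_sample=True, top_p=0.95,
                            pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))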