from datasets import Dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# Fine-tuning pairs: each record holds a question ("input") and the answer
# ("output") the model should learn to generate.
data = [
    {
        "input": "¿Cómo se conocieron Emmanuel y Rebeca?",
        "output": "Rebeca y Emmanuel se conocieron en su primer día de trabajo cuando se estrecharon la mano..."
    },
]

dataset = Dataset.from_dict({
    'input': [entry['input'] for entry in data],
    'output': [entry['output'] for entry in data]
})

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 has no dedicated padding token, so reuse the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Train on the full question + answer sequence; tokenizing only the
    # question would never show the model the answers it is meant to learn.
    texts = [
        q + "\n" + a + tokenizer.eos_token
        for q, a in zip(examples['input'], examples['output'])
    ]
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['input', 'output'])
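
# Optional sanity check: decode the first example to confirm the question
# and answer were joined as intended before training on them.
print(tokenizer.decode(tokenized_datasets[0]['input_ids'], skip_special_tokens=True))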

model = GPT2LMHeadModel.from_pretrained('gpt2')

output_dir = "./app"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
)
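
# GPT-2 only returns a training loss when labels are supplied; with mlm=False
# this collator copies input_ids into labels for causal language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)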

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    # Reusing the training set for evaluation is a placeholder; prefer a
    # held-out split in practice.
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model('./app/gpt2-small-finetuned')
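
# Saving the tokenizer alongside the model makes the checkpoint directory
# self-contained when reloaded (an assumption about the intended use).
tokenizer.save_pretrained('./app/gpt2-small-finetuned')

# Minimal usage sketch: reload the fine-tuned weights and generate an answer
# for one of the training questions.
finetuned = GPT2LMHeadModel.from_pretrained('./app/gpt2-small-finetuned')
prompt = "¿Cómo se conocieron Emmanuel y Rebeca?\n"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
output_ids = finetuned.generate(input_ids, max_new_tokens=60, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))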