from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# Load the GPT-2 tokenizer. GPT-2 has no padding token by default,
# so reuse the end-of-sequence token for padding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the 'Question' column to a fixed length of 128 tokens.
def tokenize_function(examples):
    return tokenizer(
        examples['Question'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )
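
# A hedged variant, not wired into the pipeline below: training on the
# question text alone teaches the model to reproduce questions rather than
# answer them. If the dataset also has an 'Answer' column (an assumption;
# check the dataset card), a common causal-LM setup concatenates question
# and answer into one training sequence. 'tokenize_qa_function' is a
# hypothetical name.
def tokenize_qa_function(examples):
    texts = [
        q + tokenizer.eos_token + a
        for q, a in zip(examples['Question'], examples['Answer'])
    ]
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )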

# Download the dataset from the Hugging Face Hub.
dataset = load_dataset('InnerI/synCAI_144kda')

# Tokenize every split in batches; original columns are kept here, but the
# Trainer drops unused ones by default.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
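
# Optional sanity check (a minimal sketch, assuming the 'train' split used
# below): confirm each example was padded/truncated to 128 tokens.
example = tokenized_datasets['train'][0]
print(len(example['input_ids']))  # expect 128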

# Load the pretrained GPT-2 medium model with a language-modeling head.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Collator for causal language modeling (mlm=False): labels are a copy of
# input_ids (the model shifts them internally), and padding positions are
# set to -100 so the loss ignores them. Because the pad token is the EOS
# token here, EOS positions are masked as well.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
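
# Optional check, a sketch assuming the tokenized 'train' split above:
# build one batch by hand to see the tensors the collator produces.
features = [
    {
        'input_ids': tokenized_datasets['train'][i]['input_ids'],
        'attention_mask': tokenized_datasets['train'][i]['attention_mask'],
    }
    for i in range(2)
]
batch = data_collator(features)
print(batch['input_ids'].shape, batch['labels'].shape)  # (2, 128) each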

# Training configuration: a single epoch, checkpoints every 10,000 steps,
# and at most two checkpoints kept on disk.
training_args = TrainingArguments(
    output_dir="InnerI/synCAI-144k-gpt2.5",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# The Trainer wires together the model, collator, and training data.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
)

# Run fine-tuning.
trainer.train()

# Save the fine-tuned model weights and config to a local directory.
trainer.save_model("CAI-gpt2.5")
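
# Inference sketch (an illustrative example, run in the same session as
# training): trainer.save_model does not save the tokenizer, so persist it
# explicitly before reloading both. The prompt string is a hypothetical
# placeholder.
tokenizer.save_pretrained("CAI-gpt2.5")

model = GPT2LMHeadModel.from_pretrained("CAI-gpt2.5")
tokenizer = GPT2Tokenizer.from_pretrained("CAI-gpt2.5")

inputs = tokenizer("What is consciousness?", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))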