import os

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Configuration. sshleifer/tiny-gpt2 is a tiny test-sized checkpoint, so this
# script is best treated as a pipeline smoke test rather than real fine-tuning.
model_name = "sshleifer/tiny-gpt2"
data_file = "data.txt"  # plain-text corpus; the 'text' loader treats each line as one example
output_dir = "./fine-tuned-tiny-gpt2"
block_size = 512  # max tokens per example; within GPT-2's usual 1024-token context

# Load the tokenizer that matches the checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 ships without a pad token, which the data collator needs for batching.
# Adding a dedicated [PAD] token works; reusing the EOS token
# (tokenizer.pad_token = tokenizer.eos_token) is a common alternative that
# avoids growing the embedding matrix.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2LMHeadModel.from_pretrained(model_name)

# The vocabulary may have grown by one ([PAD]), so resize the embedding
# matrix to match the tokenizer before training.
model.resize_token_embeddings(len(tokenizer))


def load_and_preprocess_dataset(file_path, tokenizer, block_size):
    """Load a plain-text file and tokenize it line by line, truncating to block_size."""
    dataset = load_dataset('text', data_files=file_path, split='train')

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, max_length=block_size)

    # Drop the raw text column so only input_ids/attention_mask remain.
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    return tokenized_dataset


dataset = load_and_preprocess_dataset(data_file, tokenizer, block_size)
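
# Optional sanity check: confirm the raw 'text' column was replaced by token
# fields before handing the dataset to the Trainer.
print(dataset[0])  # expect keys like 'input_ids' and 'attention_mask'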

# Hold out 10% of the examples for evaluation.
split = dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']

# mlm=False yields causal-LM labels (inputs shifted by one inside the model);
# padding to a multiple of 8 helps tensor-core utilisation under fp16.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)
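
# Optional peek at one collated batch: labels are a padded copy of input_ids
# with pad positions set to -100 so they are ignored by the loss.
batch = data_collator([train_dataset[i] for i in range(2)])
print(batch["input_ids"].shape, batch["labels"].shape)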

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,  # effective batch of 16 * 4 = 64 per device
    fp16=torch.cuda.is_available(),  # fp16=True would crash on CPU-only machines
    logging_dir=os.path.join(output_dir, 'logs'),
    logging_steps=200,
    save_steps=1000,
    save_total_limit=3,
    evaluation_strategy="steps",  # newer transformers releases rename this to eval_strategy
    eval_steps=1000,  # save_steps must be a multiple of eval_steps for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Run fine-tuning; the best checkpoint by eval loss is restored at the end.
trainer.train()

# Save the model and tokenizer together so the directory can be reloaded
# directly with from_pretrained(output_dir).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")
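
# Round-trip check, a minimal sketch: reload the saved artifacts and sample a
# short continuation. tiny-gpt2 is a test-sized checkpoint, so the text will
# be nonsense; this only verifies that the saved directory loads cleanly.
from transformers import pipeline

generator = pipeline("text-generation", model=output_dir, tokenizer=output_dir)
print(generator("Once upon a time", max_new_tokens=20)[0]["generated_text"])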