Benashael committed
Commit bb50cdb · 1 Parent(s): b712ce1

Create fine_tune.py

Files changed (1)
  1. fine_tune.py +46 -0
fine_tune.py ADDED
@@ -0,0 +1,46 @@
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
+ from transformers import TextDataset, DataCollatorForLanguageModeling
+ from transformers import Trainer, TrainingArguments
+
+ # Load the pre-trained GPT-2 model and tokenizer
+ model_name = "gpt2"  # or "gpt2-medium", "gpt2-large", depending on your resources
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+ # Load your dataset as fixed-size blocks of tokens
+ train_dataset = TextDataset(
+     tokenizer=tokenizer,
+     file_path="path/to/your/dataset.txt",
+     block_size=128,  # adjust as needed
+ )
+
+ # Prepare the data collator; mlm=False selects causal (next-token) language modeling
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False,
+ )
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./fine-tuned-gpt2",
+     overwrite_output_dir=True,
+     num_train_epochs=3,  # adjust as needed
+     per_device_train_batch_size=4,  # adjust based on GPU memory
+     save_steps=10_000,  # save a checkpoint every 10,000 steps
+     save_total_limit=2,  # keep only the two most recent checkpoints
+ )
+
+ # Initialize the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=train_dataset,
+ )
+
+ # Fine-tune the model
+ trainer.train()
+
+ # Save the fine-tuned model and tokenizer
+ model.save_pretrained("./fine-tuned-gpt2")
+ tokenizer.save_pretrained("./fine-tuned-gpt2")
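
A note on the data pipeline: TextDataset is deprecated in recent versions of transformers, so the script above assumes an older release. A minimal sketch of the equivalent setup with the datasets library (an assumption on my part, not part of this commit; it reuses the tokenizer and the same file path and block size as above) would look like:

from datasets import load_dataset

# Load the raw text file as a dataset with one example per line
raw = load_dataset("text", data_files={"train": "path/to/your/dataset.txt"})

# Tokenize each line; the original "text" column is dropped afterwards
def tokenize(batch):
    return tokenizer(batch["text"])

tokenized = raw["train"].map(tokenize, batched=True, remove_columns=["text"])

block_size = 128

# Concatenate all token ids and split them into fixed-size blocks,
# mirroring what TextDataset does internally
def group_texts(examples):
    concatenated = sum(examples["input_ids"], [])
    total = (len(concatenated) // block_size) * block_size
    return {"input_ids": [concatenated[i : i + block_size] for i in range(0, total, block_size)]}

train_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)

The resulting train_dataset can be passed to the same Trainer unchanged, since DataCollatorForLanguageModeling only needs the input_ids column.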
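
Once training finishes, a quick way to sanity-check the result is to reload the saved model and generate a few tokens. A minimal sketch, where the prompt and generation parameters are illustrative assumptions rather than part of the commit:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned model and tokenizer saved by fine_tune.py
model = GPT2LMHeadModel.from_pretrained("./fine-tuned-gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("./fine-tuned-gpt2")

# Encode a prompt and sample a continuation
inputs = tokenizer("Once upon a time", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,  # length of the generated continuation
    do_sample=True,  # sample instead of greedy decoding
    top_p=0.95,  # nucleus sampling cutoff
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token by default
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))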