Hamses committed on
Commit
f9d6479
·
verified ·
1 Parent(s): c8a9617

Create preprocess Dataset

Browse files
Files changed (1) hide show
  1. preprocess Dataset +15 -0
preprocess Dataset ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Preprocess a plain-text dataset for GPT-2 fine-tuning.

Loads train/test text files (one example per line), tokenizes them with
the GPT-2 tokenizer, and produces an encoded dataset ready for training.
"""
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load the custom dataset (ensure it's in the proper format: plain text files).
dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})

# Load the GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# BUG FIX: GPT-2 ships with no pad token, so padding='max_length' below would
# raise "Asking to pad but the tokenizer does not have a padding token".
# Reusing the EOS token as the pad token is the standard remedy for GPT-2.
tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    """Tokenize a batch of examples, padding/truncating to the model max length.

    Args:
        examples: a batch dict from datasets.map with a 'text' key
            holding a list of strings.

    Returns:
        A dict of tokenizer outputs (input_ids, attention_mask, ...).
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True)


# Apply tokenization over the whole dataset in batches for speed.
encoded_dataset = dataset.map(preprocess_function, batched=True)