Create preprocess Dataset
Browse files- preprocess Dataset +15 -0
preprocess Dataset
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load a custom text dataset and the GPT-2 tokenizer for preprocessing.
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load your custom dataset (ensure it's in the proper format:
# the 'text' loader treats each line of the files as one example).
dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})

# Load the GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 has no pad token out of the box, so any call with
# padding='max_length' (as in the preprocessing step below) would raise
# "Asking to pad but the tokenizer does not have a padding token".
# Reusing the EOS token as the pad token is the standard fix for GPT-2.
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):
    """Tokenize one batch of raw text examples.

    Each sequence is truncated to the tokenizer's maximum length and
    padded up to that same length, so every output has a fixed size.
    Intended for use with ``Dataset.map(..., batched=True)``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')
# Tokenize every split of the dataset; batched=True passes the mapping
# function lists of examples at a time instead of one example per call.
encoded_dataset = dataset.map(preprocess_function, batched=True)