Update preprocess Dataset
preprocess Dataset  CHANGED  +2 -2
@@ -1,7 +1,7 @@
 from datasets import load_dataset
 
 # Load your custom dataset (ensure it's in the proper format)
-dataset = load_dataset('
+dataset = load_dataset('EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})
 
 # Load the GPT-2 tokenizer
 from transformers import GPT2Tokenizer
@@ -10,6 +10,6 @@ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
 # Preprocess the dataset
 def preprocess_function(examples):
-    return tokenizer(examples['
+    return tokenizer(examples['EU_Regulation_261_2004'], padding='max_length', truncation=True)
 
 encoded_dataset = dataset.map(preprocess_function, batched=True)
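Note on the committed version: load_dataset('EU_Regulation_261_2004', ...) expects a dataset loading script with that name, and examples['EU_Regulation_261_2004'] expects a column of the same name; with plain train.txt/test.txt files neither is guaranteed to exist. A minimal runnable sketch, assuming the two .txt files sit locally, would instead swap in the built-in 'text' builder (which exposes each line under a 'text' column) and assign GPT-2 a pad token, since its tokenizer ships without one and padding='max_length' raises an error otherwise:

from datasets import load_dataset
from transformers import GPT2Tokenizer

# Built-in 'text' builder: one example per line, stored in a 'text' column.
# Assumption: train.txt and test.txt sit next to this script.
dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})

# GPT-2 has no pad token by default; reuse EOS so padding='max_length' works.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

With batched=True, map hands preprocess_function a batch of lines at a time, so the tokenizer runs once per batch rather than once per example.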