Hamses committed on
Commit
43b3b94
·
verified ·
1 Parent(s): f9d6479

Update preprocess Dataset

Browse files
Files changed (1) hide show
  1. preprocess Dataset +2 -2
preprocess Dataset CHANGED
@@ -1,7 +1,7 @@
1
  from datasets import load_dataset
2
 
3
  # Load your custom dataset (ensure it's in the proper format)
4
- dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})
5
 
6
  # Load the GPT-2 tokenizer
7
  from transformers import GPT2Tokenizer
@@ -10,6 +10,6 @@ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
10
 
11
  # Preprocess the dataset
12
  def preprocess_function(examples):
13
- return tokenizer(examples['text'], padding='max_length', truncation=True)
14
 
15
  encoded_dataset = dataset.map(preprocess_function, batched=True)
 
1
  from datasets import load_dataset
2
 
3
  # Load your custom dataset (ensure it's in the proper format)
4
+ dataset = load_dataset('EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})
5
 
6
  # Load the GPT-2 tokenizer
7
  from transformers import GPT2Tokenizer
 
10
 
11
  # Preprocess the dataset
12
  def preprocess_function(examples):
13
+ return tokenizer(examples['EU_Regulation_261_2004'], padding='max_length', truncation=True)
14
 
15
  encoded_dataset = dataset.map(preprocess_function, batched=True)