Update preprocess Dataset
preprocess Dataset  CHANGED  +2 -2
@@ -1,7 +1,7 @@
 from datasets import load_dataset
 
 # Load your custom dataset (ensure it's in the proper format)
-dataset = load_dataset('
+dataset = load_dataset('EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})
 
 # Load the GPT-2 tokenizer
 from transformers import GPT2Tokenizer
@@ -10,6 +10,6 @@ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
 # Preprocess the dataset
 def preprocess_function(examples):
-    return tokenizer(examples['
+    return tokenizer(examples['EU_Regulation_261_2004'], padding='max_length', truncation=True)
 
 encoded_dataset = dataset.map(preprocess_function, batched=True)
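Note on the committed version: load_dataset('EU_Regulation_261_2004', ...) expects a dataset loading script with that name, and examples['EU_Regulation_261_2004'] expects a column of the same name; with plain train.txt/test.txt files neither is guaranteed to exist. A minimal runnable sketch, assuming the two .txt files sit locally, would instead swap in the built-in 'text' builder (which exposes each line under a 'text' column) and assign GPT-2 a pad token, since its tokenizer ships without one and padding='max_length' raises an error otherwise:

from datasets import load_dataset
from transformers import GPT2Tokenizer

# Built-in 'text' builder: one example per line, stored in a 'text' column.
# Assumption: train.txt and test.txt sit next to this script.
dataset = load_dataset('text', data_files={'train': 'train.txt', 'test': 'test.txt'})

# GPT-2 has no pad token by default; reuse EOS so padding='max_length' works.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

With batched=True, map hands preprocess_function a batch of lines at a time, so the tokenizer runs once per batch rather than once per example.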