---
license: cc0-1.0
datasets:
- Hamses/EU_Regulation_261_2004
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- legal
---
This card walks through fine-tuning GPT-2 on the `Hamses/EU_Regulation_261_2004` dataset for text generation. First, install the dependencies:

```bash
pip install transformers datasets torch
```
Load the dataset and tokenize it. GPT-2's tokenizer has no pad token by default, so one must be set before padding:

```python
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load the dataset (adjust data_files to match the files in the dataset repo)
dataset = load_dataset('Hamses/EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})

# Load the GPT-2 tokenizer; reuse the EOS token for padding
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the `text` column with fixed-length padding and truncation
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)
```
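As a quick sanity check (a sketch; the field names are the tokenizer's standard outputs), each encoded example should now carry `input_ids` and an `attention_mask` padded to GPT-2's 1024-token context:

```python
# Inspect one encoded training example
sample = encoded_dataset['train'][0]
print(sorted(sample.keys()))     # e.g. ['attention_mask', 'input_ids', 'text']
print(len(sample['input_ids']))  # 1024: padding='max_length' pads to the model's max length
```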
Next, load the model and set up the `Trainer`. For causal language modeling the `Trainer` needs a collator that copies the inputs into a `labels` field; without it, no loss can be computed:

```python
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load the pretrained GPT-2 model with a language-modeling head
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Builds labels for causal LM; mlm=False disables masked-language-model masking
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)
```
Train, evaluate, and save:

```python
# Train the model
trainer.train()

# Evaluate on the test split
results = trainer.evaluate()
print(results)

# Save the fine-tuned model and tokenizer
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')
```
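Since the card is tagged `text-generation`, a minimal inference sketch with the saved checkpoint may be useful (the prompt below is illustrative, not from the dataset):

```python
from transformers import pipeline

# Load the fine-tuned checkpoint (model and tokenizer) into a text-generation pipeline
generator = pipeline('text-generation', model='./gpt2-finetuned')

# Illustrative prompt; Regulation (EC) No 261/2004 covers air passenger rights
prompt = "Under Regulation (EC) No 261/2004, a passenger whose flight is cancelled"
print(generator(prompt, max_new_tokens=60, do_sample=True, top_p=0.9)[0]['generated_text'])
```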