Hamses's picture
Update README.md
71a06c1 verified
---
license: cc0-1.0
datasets:
- Hamses/EU_Regulation_261_2004
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- legal
---
# Install the dependencies first (run in a shell): pip install transformers datasets torch
from datasets import load_dataset

# Load the custom dataset. train.txt and test.txt must be available for
# 'Hamses/EU_Regulation_261_2004'; they become the 'train' and 'test' splits.
dataset = load_dataset(
    'Hamses/EU_Regulation_261_2004',
    data_files=dict(train='train.txt', test='test.txt'),
)
# Load the GPT-2 tokenizer
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token, so padding='max_length' below would
# raise a ValueError. Reuse the end-of-text token for padding; padded
# positions are kept out of the loss by the label mask built in
# preprocess_function.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of raw text and attach causal-LM training labels.

    Args:
        examples: A batch passed in by ``dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch (padded with
        ``padding='max_length'`` and truncated) with an added ``'labels'``
        entry: a copy of ``'input_ids'`` in which every position whose
        attention mask is 0 (padding) is replaced by -100.
    """
    encoded = tokenizer(examples['text'], padding='max_length', truncation=True)
    # Without labels the model returns no loss and training cannot start.
    # For causal language modelling the labels mirror input_ids (the model
    # shifts them internally); -100 marks positions the loss must ignore.
    encoded['labels'] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded


encoded_dataset = dataset.map(preprocess_function, batched=True)
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Start from the pretrained GPT-2 checkpoint
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Settings for the fine-tuning run
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)
# Wire the model, the training settings and the tokenized splits together
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_dataset['test'],
    train_dataset=encoded_dataset['train'],
)

# Fine-tune, then compute and print the metrics on the evaluation split
trainer.train()
results = trainer.evaluate()
print(results)

# Store the model first, then the tokenizer, in the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('./gpt2-finetuned')