---
license: cc0-1.0
datasets:
  - Hamses/EU_Regulation_261_2004
language:
  - en
library_name: transformers
pipeline_tag: text-generation
tags:
  - legal
---

Install the required packages:

```bash
pip install transformers datasets torch
```

Load your custom dataset (ensure it is in the proper format):

```python
from datasets import load_dataset

dataset = load_dataset('Hamses/EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})
```
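
A quick sanity check that the splits loaded as expected; this assumes the default `text` column that plain-text files produce:

```python
# Inspect the split sizes and peek at the first training example
print(dataset)
print(dataset['train'][0]['text'][:200])
```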

Load the GPT-2 tokenizer:

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 has no padding token by default; reuse the end-of-sequence token
# so that padding='max_length' below does not raise an error
tokenizer.pad_token = tokenizer.eos_token
```

Preprocess the dataset:

```python
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)
```

Load the GPT-2 model:

```python
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

model = GPT2LMHeadModel.from_pretrained('gpt2')
```

Define the training arguments:

```python
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)
```

Initialize the Trainer. A data collator is needed so that each batch carries `labels` for the causal language-modelling loss:

```python
from transformers import DataCollatorForLanguageModeling

# mlm=False makes the collator copy input_ids into labels for causal LM training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    data_collator=data_collator,
)
```

Train the model:

```python
trainer.train()
```

Evaluate the model:

```python
results = trainer.evaluate()
print(results)
```
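
The evaluation loss can be converted into perplexity, a common metric for language models. A minimal sketch, assuming `results` contains the Trainer's default `eval_loss` key:

```python
import math

# Perplexity is the exponential of the mean cross-entropy loss
perplexity = math.exp(results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")
```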

Save the model and tokenizer:

```python
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')
```
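
To generate text with the fine-tuned model, a minimal sketch using the `text-generation` pipeline; the prompt below is purely illustrative:

```python
from transformers import pipeline

# Load the fine-tuned weights and tokenizer saved above
generator = pipeline('text-generation', model='./gpt2-finetuned', tokenizer='./gpt2-finetuned')

# Illustrative prompt about EU Regulation 261/2004 (air passenger rights)
output = generator('Under Regulation (EC) No 261/2004, passengers are entitled to', max_new_tokens=60)
print(output[0]['generated_text'])
```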