---
license: cc0-1.0
datasets:
- Hamses/EU_Regulation_261_2004
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- legal
---
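GPT-2 fine-tuned for text generation on the Hamses/EU_Regulation_261_2004 dataset (Regulation (EC) No 261/2004, the EU air passenger rights regulation). The steps below walk through fine-tuning and saving the model.

Install the dependencies: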
```bash
pip install transformers datasets torch
```
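Load the dataset and tokenize it. GPT-2 does not define a padding token, so the end-of-sequence token is reused for padding: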
```python
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load the dataset (ensure data_files matches the files actually present in the repo)
dataset = load_dataset('Hamses/EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})

# Load the GPT-2 tokenizer and reuse EOS as the padding token (GPT-2 has none of its own)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize, padding and truncating each example to the model's maximum length
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)
```
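Set up the model and the Trainer. The data collator with `mlm=False` is what turns the input IDs into causal language modeling labels; without it the Trainer has no loss to optimize: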
```python
from transformers import GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Load the pretrained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Collator that copies input_ids to labels for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)
```
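Train, evaluate, and save the result: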
```python
# Train the model
trainer.train()

# Evaluate on the test split (reports the evaluation loss)
results = trainer.evaluate()
print(results)

# Save the fine-tuned model and tokenizer
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')
```
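The saved checkpoint can then be reloaded for inference. A minimal sketch, assuming the `./gpt2-finetuned` directory from above; the prompt is only illustrative:

```python
from transformers import pipeline

# Load the fine-tuned checkpoint (model + tokenizer) into a text-generation pipeline
generator = pipeline('text-generation', model='./gpt2-finetuned')

# Illustrative prompt; any passenger-rights query works the same way
prompt = "Under Regulation (EC) No 261/2004, passengers are entitled to"
print(generator(prompt, max_new_tokens=60, do_sample=True)[0]['generated_text'])
```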