---
license: cc0-1.0
datasets:
- Hamses/EU_Regulation_261_2004
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- legal
---
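This card sketches a minimal recipe for fine-tuning GPT-2 on the Hamses/EU_Regulation_261_2004 dataset (the EU air passenger rights regulation) for text generation. Install the dependencies first: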
```bash
pip install transformers datasets torch
```

Load the dataset and tokenize it with the GPT-2 tokenizer:

```python
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load the dataset (adjust data_files to match how the data is actually stored)
dataset = load_dataset('Hamses/EU_Regulation_261_2004', data_files={'train': 'train.txt', 'test': 'test.txt'})

# Load the GPT-2 tokenizer; GPT-2 ships without a pad token, so reuse the end-of-text token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the text column, padding and truncating to the model's maximum length
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)
```
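The mapped dataset keeps the original text column and gains `input_ids` and `attention_mask`, which is what the Trainer consumes. A quick sanity check (output illustrative; exact columns depend on the dataset schema):

```python
# Inspect the columns of one tokenized training example
print(encoded_dataset['train'][0].keys())
# e.g. dict_keys(['text', 'input_ids', 'attention_mask'])
```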

Fine-tune with the Trainer API. Causal language modeling needs labels, which `DataCollatorForLanguageModeling(mlm=False)` derives from the input IDs:

```python
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Collator that turns input_ids into labels for causal LM training (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # directory for checkpoints and outputs
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size per device during evaluation
    warmup_steps=500,                # linear learning-rate warmup steps
    weight_decay=0.01,               # weight decay for regularization
    logging_dir='./logs',            # directory for logs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)
```
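`trainer.evaluate()` reports the average cross-entropy as `eval_loss`; its exponential is the model's perplexity on the test split. A quick conversion (assuming the default `eval_loss` key):

```python
import math

# Perplexity is the exponential of the average cross-entropy loss
print(f"Perplexity: {math.exp(results['eval_loss']):.2f}")
```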

```python
# Save the fine-tuned model and tokenizer
model.save_pretrained('./gpt2-finetuned')
tokenizer.save_pretrained('./gpt2-finetuned')
```
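After training, the saved checkpoint can be reloaded for generation. A minimal sketch, assuming the `./gpt2-finetuned` directory from above (the prompt is illustrative):

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned model and tokenizer from disk
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned')
model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned')
model.eval()

# Illustrative prompt about the regulation's subject matter
prompt = "Under Regulation (EC) No 261/2004, a passenger is entitled to"
inputs = tokenizer(prompt, return_tensors='pt')

# Sample a continuation; pad_token_id silences the missing-pad-token warning
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```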