File size: 3,207 Bytes
0b0afda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
"""LLM.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1AbtqagXl-cWKhXqd5z_uxZ2_JiCI4e6q
"""

#  Step 1: Setup
# pip install transformers
# pip install datasets

# !pip install transformers[torch]

# !pip install transformers --upgrade
# !pip install accelerate --upgrade

import transformers
import accelerate

# Sanity-check that the freshly installed versions are the ones actually
# imported (Colab sometimes needs a runtime restart after pip upgrades).
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

# Step 2: Mount Google Drive to access your data
# Mounts the user's Drive under /content/drive (interactive auth prompt on
# first run). Colab-only: fails outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Step 3: Import necessary libraries (renumbered: the original comment
# repeated "Step 2")
import torch
# NOTE(review): the Bert* classes below are never used in this script —
# likely leftovers from an earlier BERT experiment; kept so any unseen code
# that relies on them still works.
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import disable_caching

# Disable the datasets on-disk cache so edited data files are always re-read.
# BUG FIX: set_caching_enabled(False) was deprecated (and later removed) in
# `datasets`; disable_caching() is the supported replacement.
disable_caching()

from datasets import load_dataset

# Load the raw text corpus from Drive; each line of the file becomes one
# training example. Adjust the path to wherever your .txt file lives.
dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/Ethics.txt'})

# Hold out the final 10% of lines as a validation split (no shuffling —
# the split is positional, exactly as before).
n_total = len(dataset['train'])
n_train = int(n_total * 0.9)
train_dataset = dataset['train'].select(range(n_train))
validation_dataset = dataset['train'].select(range(n_train, n_total))

print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(validation_dataset))

# Step 4: Tokenization
# Load the pretrained GPT-2 byte-level BPE tokenizer (downloaded from the
# Hugging Face hub on first use, then cached).
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def tokenize_function(examples):
    """Tokenize a batch of raw text examples.

    `examples` is a batched mapping from `datasets.Dataset.map` with a
    "text" column (list of strings). Returns the tokenizer's encoding
    dict (input_ids, attention_mask).

    BUG FIX: truncate to the tokenizer's model_max_length — GPT-2 accepts
    at most 1024 tokens, and an untruncated long line would make training
    fail at the position-embedding lookup.
    """
    return tokenizer(examples["text"], truncation=True)

# Tokenize the dataset with a reduced number of workers
# (single-process mapping avoids multiprocessing issues in Colab runtimes).
# NOTE(review): this tokenizes the *full* dataset, but the Trainer further
# down is handed the untokenized train/validation splits — verify which
# dataset training is actually meant to consume.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1  # Set the number of workers to 1
)

# !pip install gpt-2-simple

# NOTE(review): gpt_2_simple is imported but never used anywhere below —
# likely a leftover from an earlier approach; consider removing it (and the
# pip install above) to drop the dependency.
import gpt_2_simple as gpt2

# !pip install accelerate>=0.20.1
# !pip install accelerate -U

# Step 5: Model Preparation
# Load the pretrained GPT-2 causal language model (base 124M-parameter
# checkpoint) to be fine-tuned below.
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Step 6: Training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# GPT-2's tokenizer ships without a pad token; reuse EOS so the collator
# can pad batches.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False -> causal-LM objective: labels are the input_ids (shifted
# internally by the model), which is what GPT-2 training requires.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# BUG FIX: the original handed the *raw text* splits to the Trainer; a
# language model needs token ids, so tokenize the splits here and drop the
# raw "text" column, which the model's forward() would reject.
tokenized_train = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_validation = validation_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
)

# BUG FIX: the original never called train(), so the "fine_tuned_model"
# saved afterwards was just the unmodified pretrained gpt2 weights.
trainer.train()

# Step 9: Save the Model
model.save_pretrained("fine_tuned_model")
# Also save the tokenizer so the directory is self-contained and can be
# reloaded later with from_pretrained("fine_tuned_model").
tokenizer.save_pretrained("fine_tuned_model")

# !pip install xformers

# # Step 7: Testing
# from transformers import pipeline
# generator = pipeline('text-generation', model=model)
# result = generator('My custom model says,')[0]
# print(result['generated_text'])
# Step 7: Testing
from transformers import pipeline, GPT2Tokenizer

# Pair the in-memory (fine-tuned) model with a GPT-2 tokenizer for
# generation.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # You might need to adjust the model name

# Build a text-generation pipeline around the trained model.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Sample a completion for a fixed prompt and display it.
sample = generator('My custom model says,')
print(sample[0]['generated_text'])