In [43]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [44]:
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([Lowercase()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

In [45]:
trainer = BpeTrainer(vocab_size = 50000, initial_alphabet=ByteLevel.alphabet(), special_tokens=['', '', '','', ''])
tokenizer.train(["../../datasets/austen-emma.txt"], trainer)

In [46]:
tokenizer.save("tokenizer_gpt/tokenizer.json")

In [47]:
from transformers import (GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel)
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("tokenizer_gpt")

In [48]:
tokenizer_gpt.add_special_tokens({
 "eos_token": "",
 "bos_token": "",
 "unk_token": "",
 "pad_token": "",
 "mask_token": ""
})

1

In [49]:
tokenizer_gpt.encode(" thisis ")

[0, 469, 293, 225, 2]

In [50]:
tokenizer_gpt.encode(" this is ")

[0, 469, 361, 225, 2]

In [51]:
config = GPT2Config(
 vocab_size = tokenizer_gpt.vocab_size,
 bos_token_id = tokenizer_gpt.bos_token_id,
 eos_token_id = tokenizer_gpt.eos_token_id
)

model = GPT2LMHeadModel(config)

In [52]:
config

GPT2Config {
 "activation_function": "gelu_new",
 "attn_pdrop": 0.1,
 "bos_token_id": 0,
 "embd_pdrop": 0.1,
 "eos_token_id": 2,
 "initializer_range": 0.02,
 "layer_norm_epsilon": 1e-05,
 "model_type": "gpt2",
 "n_embd": 768,
 "n_head": 12,
 "n_inner": null,
 "n_layer": 12,
 "n_positions": 1024,
 "reorder_and_upcast_attn": false,
 "resid_pdrop": 0.1,
 "scale_attn_by_inverse_layer_idx": false,
 "scale_attn_weights": true,
 "summary_activation": null,
 "summary_first_dropout": 0.1,
 "summary_proj_to_labels": true,
 "summary_type": "cls_index",
 "summary_use_proj": true,
 "transformers_version": "4.54.1",
 "use_cache": true,
 "vocab_size": 11954
}

In [53]:
model

GPT2LMHeadModel(
 (transformer): GPT2Model(
 (wte): Embedding(11954, 768)
 (wpe): Embedding(1024, 768)
 (drop): Dropout(p=0.1, inplace=False)
 (h): ModuleList(
 (0-11): 12 x GPT2Block(
 (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 (attn): GPT2Attention(
 (c_attn): Conv1D(nf=2304, nx=768)
 (c_proj): Conv1D(nf=768, nx=768)
 (attn_dropout): Dropout(p=0.1, inplace=False)
 (resid_dropout): Dropout(p=0.1, inplace=False)
 )
 (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 (mlp): GPT2MLP(
 (c_fc): Conv1D(nf=3072, nx=768)
 (c_proj): Conv1D(nf=768, nx=3072)
 (act): NewGELUActivation()
 (dropout): Dropout(p=0.1, inplace=False)
 )
 )
 )
 (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 )
 (lm_head): Linear(in_features=768, out_features=11954, bias=False)
)

In [54]:
with open("../../datasets/austen-emma.txt", "r", encoding='utf-8') as f:
 content = f.readlines()

In [55]:
content_p = []
for c in content:
 if len(c) > 10:
 content_p.append(c.strip())
content_p = ' '.join(content_p) + tokenizer_gpt.eos_token

In [56]:
tokenized_content = tokenizer_gpt.encode(content_p)

In [57]:
len(tokenized_content)

195221

In [58]:
sample_len = 100
examples = []
for i in range(0, len(tokenized_content) - sample_len + 1):
 examples.append(
 tokenized_content[i:i+ sample_len]
 )

train_data = []
labels = []
for example in examples:
 train_data.append(example[:-1])
 labels.append(example[1:])

In [59]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

buffer = 500
batch_size = 64

train_data = torch.Tensor(train_data).to(dtype=torch.long).cuda()
labels = torch.Tensor(labels).to(dtype=torch.long).cuda()
dataset = TensorDataset(train_data, labels)

loader = DataLoader(dataset, batch_size=batch_size, drop_last=True, shuffle=True)

In [60]:
from torch.optim import Adam
import torch.nn as nn
from tqdm import tqdm

model = model.cuda()
optimizer = Adam(model.parameters(), lr=3e-5, eps=1e-08)
criterion = nn.CrossEntropyLoss()

In [None]:
model.train()
# for epoch in tqdm(range(0, 1)):
losses = 0.
aucc = 0
cnt = 0
progress = tqdm(loader)
for x, y in progress:
 pred = model(x).logits
 
 y = F.one_hot(y, num_classes=tokenizer_gpt.vocab_size)
 
 loss = criterion(pred.to(dtype=torch.float32), y.to(dtype=torch.float32))
 optimizer.zero_grad()
 loss.backward()
 optimizer.step()
 
 pred = F.softmax(pred)
 y = torch.argmax(y, dim=2)
 aucc += torch.sum(pred == y)
 cnt += (batch_size*99)
 losses += loss
 progress.set_postfix_str(f"loss: {loss.cpu().detach().numpy():.04f}, aucc: {aucc.cpu()/cnt:.04f}")
 
print(losses / len(loader))
print(aucc / cnt)

 pred = F.softmax(pred)
100%|██████████| 3048/3048 [07:58<00:00, 6.37it/s, loss: 0.0061, aucc: 0.4009]

tensor(0.0085, device='cuda:0', grad_fn=)





TypeError: len() takes exactly one argument (0 given)

In [90]:
def generate(start, model):
 input_token_ids = tokenizer_gpt.encode(start, return_tensors='pt').cuda()
 output = model.generate(
 input_token_ids,
 max_length= 500,
 num_beams = 5,
 temperature=0.7,
 no_repeat_ngram_size=2,
 num_return_sequences=1
 )
 return tokenizer_gpt.decode(output[0])

In [92]:
generate(" ", model)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' yes!" cried emma.--"my dearest harriet, mr. weston, who had been walking away from hartfield, he would never have seen him before; and while she did not quite understand how it might be supposed that she could give up the idea of any body\'s coming to such a thing by his manners. "i am afraid," he replied, "that you must be the greatest pleasure. you do not know what your father would have heard." "oh! yes, my dear--but i dare say i am sure i shall think they will think you are very much obliged to be sure.--but this is an excellent miss smith, however, indeed; but there is being _you are quite enough to make one of course.--i can hardly ever hear of every thing to see nothing else." she comes in love with me, sir?--well--a very bad.--he is coming over this morning and yet quite complete in the whole." emma could not likely to call upon the same glance at all these words, if they walked on that sort of his feelings: i must take care about ten months ago, perhaps--what an old acquaint

In [93]:
from transformers import (WEIGHTS_NAME, CONFIG_NAME)

In [94]:
WEIGHTS_NAME

'pytorch_model.bin'

In [95]:
CONFIG_NAME

'config.json'

In [96]:
model.save_pretrained(WEIGHTS_NAME)

In [97]:
config.save_pretrained(CONFIG_NAME)