import torch
import torch.nn as nn


class ScratchTokenizer:
    """A minimal whitespace tokenizer built from scratch."""

    def __init__(self):
        # Reserve the four special tokens; real words are appended after them.
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.vocab_size = 4

    def build_vocab(self, texts):
        # Assign the next free index to every previously unseen word.
        for text in texts:
            for word in text.split():
                if word not in self.word2idx:
                    self.word2idx[word] = self.vocab_size
                    self.idx2word[self.vocab_size] = word
                    self.vocab_size += 1

    def encode(self, text, max_len=200):
        # Map unknown words to <UNK> (index 3).
        tokens = [self.word2idx.get(word, 3) for word in text.split()]
        # Wrap with <SOS>/<EOS>, truncating so the result never exceeds max_len.
        tokens = [1] + tokens[:max_len - 2] + [2]
        # Right-pad with <PAD> (index 0) up to max_len.
        return tokens + [0] * (max_len - len(tokens))

    def decode(self, tokens):
        # Drop <PAD> tokens (index 0) and join the remaining words.
        return " ".join(self.idx2word.get(idx, "<UNK>") for idx in tokens if idx > 0)
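
# A quick sketch of how the tokenizer behaves (the two-sentence corpus below is
# purely illustrative):
#
#   tokenizer = ScratchTokenizer()
#   tokenizer.build_vocab(["hello world", "hello there"])
#   tokenizer.encode("hello world", max_len=8)
#   # -> [1, 4, 5, 2, 0, 0, 0, 0]   (<SOS>, hello, world, <EOS>, then <PAD>)
#   tokenizer.decode([1, 4, 5, 2, 0, 0, 0, 0])
#   # -> "<SOS> hello world <EOS>"   (only <PAD> is stripped)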


class GPTModel(nn.Module):
    def __init__(self, vocab_size, embed_size=256, num_heads=8, num_layers=6, max_len=200):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional embeddings, one vector per position up to max_len.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # Stack of decoder layers; each layer cross-attends to the src sequence.
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        # Token + positional embeddings for both sequences: (batch, seq, embed).
        src_emb = self.embedding(src) + self.pos_embedding[:, :src.size(1), :]
        tgt_emb = self.embedding(tgt) + self.pos_embedding[:, :tgt.size(1), :]

        # Causal mask so each target position only attends to earlier positions.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        # The decoder layers expect (seq, batch, embed), hence the permutes;
        # src_emb serves as the memory the decoder cross-attends to.
        output = self.transformer(tgt_emb.permute(1, 0, 2), src_emb.permute(1, 0, 2), tgt_mask=tgt_mask)
        return self.fc_out(output.permute(1, 0, 2))
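

# A minimal end-to-end sketch, assuming a tiny illustrative corpus and untrained
# weights (so the logits are random); it only demonstrates the expected shapes.
if __name__ == "__main__":
    texts = ["hello world", "hello there friend"]

    tokenizer = ScratchTokenizer()
    tokenizer.build_vocab(texts)

    model = GPTModel(vocab_size=tokenizer.vocab_size)
    src = torch.tensor([tokenizer.encode(texts[0])])  # (1, 200) source / memory ids
    tgt = torch.tensor([tokenizer.encode(texts[1])])  # (1, 200) target ids

    logits = model(src, tgt)
    print(logits.shape)  # (1, 200, vocab_size) -> (1, 200, 8) for this toy vocab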