Rafael Camargo
committed
Commit · da3a6cf
Parent(s): 95de76e
chore: indent python files using 2 spaces as tab size
- src/constants/tokens.py +3 -3
- src/prediction.py +36 -36
- src/services/model.py +44 -44
- src/services/tokenizer.py +16 -16
- src/services/transformer.py +40 -40
- src/services/word_generation_dataset.py +18 -18
- src/training.py +19 -19
src/constants/tokens.py
CHANGED
@@ -1,7 +1,7 @@
special_tokens = {
  '<pad>': 0,
  '<sos>': 1,
  '<eos>': 2,
}

PAD_ID = special_tokens["<pad>"]
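These three IDs are the contract the rest of the repo leans on: <pad> feeds nn.Embedding's padding_idx and the padding masks in transformer.py, and the dict's length sets the remapping offset used by build_vocab. A minimal sanity-check sketch (plain Python, not part of this commit):

# Sanity check of the special-token contract assumed by the files below.
special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
PAD_ID = special_tokens['<pad>']

assert len(set(special_tokens.values())) == 3               # IDs must be distinct
assert max(special_tokens.values()) < len(special_tokens)   # all below build_vocab's offset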
src/prediction.py
CHANGED
@@ -7,43 +7,43 @@ from services.model import load_model, get_device
_tokenizer = tiktoken.get_encoding("cl100k_base")

def generate_word(words, model, vocab, inv_vocab, max_length=64):
  """Generate an imaginary word and its definition from three input words."""
  device = get_device()

  # Tokenize input words
  input_text = ",".join(words)
  input_tokens = _tokenizer.encode(input_text)
  input_tensor = torch.tensor([vocab.get(str(tok), vocab["<pad>"]) for tok in input_tokens]).unsqueeze(0).to(device)

  # Initialize target with SOS token
  target = torch.tensor([[vocab["<sos>"]]]).to(device)

  # Generate output
  with torch.no_grad():
    for _ in range(max_length):
      output = model(input_tensor, target)
      next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)

      # Stop if we predict EOS token
      if next_token.item() == vocab["<eos>"]:
        break

      target = torch.cat([target, next_token], dim=1)

  # Convert output tokens to text
  output_tokens = target[0].cpu().numpy()
  output_text = _tokenizer.decode([int(inv_vocab[tok]) for tok in output_tokens if tok not in special_tokens.values()])

  return output_text

def main():
  # Load model and vocabulary
  model, vocab, inv_vocab = load_model()

  # Example usage
  words = ["muito", "grande", "imenso"]
  result = generate_word(words, model, vocab, inv_vocab)
  print(f"Input words: {', '.join(words)}")
  print(f"Generated: {result}")

main()
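generate_word decodes greedily, taking the argmax at every step, so a given input triple always produces the same output. If variety is wanted, the usual swap is temperature sampling; a hedged sketch of that replacement step (sample_next_token is hypothetical, not part of this diff):

import torch

def sample_next_token(logits, temperature=0.8):
  # logits: (batch, vocab) scores for the last position, i.e. output[:, -1, :]
  probs = torch.softmax(logits / temperature, dim=-1)
  return torch.multinomial(probs, num_samples=1)  # (batch, 1), same shape as argmax(keepdim=True)

# Hypothetical drop-in inside the generation loop above:
# next_token = sample_next_token(output[:, -1, :])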
src/services/model.py
CHANGED
@@ -14,51 +14,51 @@ _VOCAB_PATH = os.path.join(_VOCAB_DIR, "vocab.json")
_DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def save_model(model, vocab):
  """
  Save the model state and vocabulary to disk.

  Args:
    model: The trained transformer model
    vocab: The vocabulary dictionary
  """
  # Create necessary directories
  os.makedirs(_MODEL_DIR, exist_ok=True)
  os.makedirs(_VOCAB_DIR, exist_ok=True)

  # Save model state
  torch.save(model.state_dict(), _MODEL_PATH)

  # Save vocabulary
  with open(_VOCAB_PATH, "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

  print(f"Model saved to {_MODEL_PATH}")
  print(f"Vocabulary saved to {_VOCAB_PATH}")

def load_model():
  """
  Load the model and its vocabulary from disk.

  Returns:
    tuple: (model, vocab, inv_vocab)
  """
  # Load vocabulary
  with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
    vocab = json.load(f)
  inv_vocab = {int(v): k for k, v in vocab.items()}

  # Initialize and load model
  model = TinyTransformer(vocab_size=len(vocab)).to(_DEVICE)
  model.load_state_dict(torch.load(_MODEL_PATH, map_location=_DEVICE))
  model.eval()

  return model, vocab, inv_vocab

def get_device():
  """
  Get the device being used for model operations.

  Returns:
    torch.device: The device being used
  """
  return _DEVICE
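_DEVICE is pinned at import time to MPS or CPU, which suggests an Apple-silicon setup. A hedged sketch of a broader fallback chain (CUDA first) should the repo run elsewhere; pick_device is hypothetical and not part of this commit:

import torch

def pick_device():
  # Prefer CUDA, then Apple MPS, then CPU.
  if torch.cuda.is_available():
    return torch.device("cuda")
  if torch.backends.mps.is_available():
    return torch.device("mps")
  return torch.device("cpu")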
src/services/tokenizer.py
CHANGED
@@ -5,21 +5,21 @@ from constants.tokens import special_tokens
_tokenizer = tiktoken.get_encoding("cl100k_base")

def tokenize_dataset(data):
  """Tokenize keys and values using the internal tokenizer."""
  inputs = []
  outputs = []
  for key, value in data.items():
    inp_tokens = _tokenizer.encode(key)
    out_tokens = _tokenizer.encode(value)
    inputs.append(inp_tokens)
    outputs.append(out_tokens)
  return inputs, outputs

def build_vocab(inputs, outputs):
  """Build vocabulary mapping from token IDs and add special tokens."""
  offset = len(special_tokens)
  all_ids = set(tok for seq in inputs + outputs for tok in seq)
  vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
  vocab.update({k: v for k, v in special_tokens.items()})
  inv_vocab = {v: k for k, v in vocab.items()}
  return vocab, inv_vocab
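For clarity, a worked example of what build_vocab produces: raw tiktoken IDs are remapped into a dense range starting right after the special tokens. The input values here are hypothetical:

# Worked example of build_vocab's remapping (hypothetical token IDs).
special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
inputs = [[605, 17]]    # tiktoken IDs for one key
outputs = [[17, 9000]]  # tiktoken IDs for its value

offset = len(special_tokens)  # 3
all_ids = sorted({tok for seq in inputs + outputs for tok in seq})  # [17, 605, 9000]
vocab = {tok: i + offset for i, tok in enumerate(all_ids)}
vocab.update(special_tokens)
print(vocab)  # {17: 3, 605: 4, 9000: 5, '<pad>': 0, '<sos>': 1, '<eos>': 2}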
src/services/transformer.py
CHANGED
@@ -4,55 +4,55 @@ import torch.nn as nn
from constants.tokens import PAD_ID

class TinyTransformer(nn.Module):
  def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
    super().__init__()
    # self.pad_token_id = pad_token_id

    self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
    self.pos_encoder = PositionalEncoding(d_model, dropout)

    encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
    self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

    self.out = nn.Linear(d_model, vocab_size)

  def forward(self, src, tgt):
    # Keep tensors in batch-first format
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()

    src_emb = self.pos_encoder(self.embedding(src))
    tgt_emb = self.pos_encoder(self.embedding(tgt))

    # Create padding masks
    src_padding_mask = (src == PAD_ID).bool()
    tgt_padding_mask = (tgt == PAD_ID).bool()

    memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
    output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)

    return self.out(output)  # (batch, seq_len, vocab)

  def generate_src_mask(self, size):
    return torch.zeros((size, size), device='cpu').type(torch.bool)

class PositionalEncoding(nn.Module):
  def __init__(self, d_model, dropout=0.1, max_len=512):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)

    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(
      torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
    )
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(position * div_term)  # even indices
    pe[:, 1::2] = torch.cos(position * div_term)  # odd indices

    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    x = x + self.pe[:, :x.size(1), :].to(x.device)
    return self.dropout(x)
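A quick shape smoke test for the model above, assuming src/ is on the import path; the vocab size and tensor contents are hypothetical:

import torch
from services.transformer import TinyTransformer  # assumes src/ is on sys.path

model = TinyTransformer(vocab_size=100)
src = torch.randint(3, 100, (2, 10))  # (batch=2, src_len=10), IDs past the specials
tgt = torch.randint(3, 100, (2, 7))   # (batch=2, tgt_len=7)
logits = model(src, tgt)
assert logits.shape == (2, 7, 100)    # (batch, tgt_len, vocab), as forward() returns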
src/services/word_generation_dataset.py
CHANGED
@@ -4,27 +4,27 @@ from torch.nn.utils.rnn import pad_sequence
from constants.tokens import PAD_ID

def encode_with_specials(token_ids, vocab, add_sos_eos=False):
  if add_sos_eos:
    return [vocab['<sos>']] + [vocab[t] for t in token_ids] + [vocab['<eos>']]
  return [vocab[t] for t in token_ids]

class WordGenDataset(Dataset):
  def __init__(self, inputs, outputs, vocab, max_len=64):
    self.inputs = inputs
    self.outputs = outputs
    self.vocab = vocab
    self.max_len = max_len

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, idx):
    x = encode_with_specials(self.inputs[idx], self.vocab)
    y = encode_with_specials(self.outputs[idx], self.vocab, add_sos_eos=True)
    return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
  xs, ys = zip(*batch)
  xs = pad_sequence(xs, batch_first=True, padding_value=PAD_ID)
  ys = pad_sequence(ys, batch_first=True, padding_value=PAD_ID)
  return xs, ys
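A hedged sketch of wiring the dataset and collate function into a DataLoader, with hypothetical already-remapped vocab-ID sequences; batches come out padded to the longest sequence in each batch:

from torch.utils.data import DataLoader
from services.word_generation_dataset import WordGenDataset, collate_fn  # assumes src/ on sys.path

# Hypothetical sequences already remapped to vocab IDs.
inputs = [[3, 4], [5]]
outputs = [[4, 5], [3]]
vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, 3: 3, 4: 4, 5: 5}

loader = DataLoader(WordGenDataset(inputs, outputs, vocab), batch_size=2, collate_fn=collate_fn)
xs, ys = next(iter(loader))
print(xs.shape, ys.shape)  # torch.Size([2, 2]) torch.Size([2, 4]) after padding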
src/training.py
CHANGED
@@ -49,32 +49,32 @@ optimizer = optim.Adam(model.parameters(), lr=1e-4)
num_epochs = 10

for epoch in range(num_epochs):
  model.train()
  total_loss = 0

  for batch in dataloader:
    src, tgt = batch
    src, tgt = src.to(device), tgt.to(device)

    # Shift target to create input/target pairs
    tgt_input = tgt[:, :-1]
    tgt_expected = tgt[:, 1:]

    # Forward pass
    logits = model(src, tgt_input)

    # Reshape for loss: (batch*seq_len, vocab_size)
    loss = criterion(logits.reshape(-1, vocab_size), tgt_expected.reshape(-1))

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  avg_loss = total_loss / len(dataloader)
  print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

# 2.4 Save model
print("Saving...")
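The flattened loss above only trains cleanly if padded positions are excluded. criterion is defined in the part of training.py outside this hunk, so its exact form is an assumption, but it would need to look roughly like:

import torch.nn as nn
from constants.tokens import PAD_ID  # assumes src/ on sys.path

# Assumed definition from earlier in training.py (not shown in this diff):
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)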