Rafael Camargo
committed on
Commit
·
94aa96e
1
Parent(s):
da3a6cf
chore: improve comments and remove unnecessary blank lines
Browse files- src/prediction.py +1 -1
- src/services/model.py +3 -19
- src/services/tokenizer.py +2 -2
- src/services/transformer.py +1 -13
src/prediction.py
CHANGED
|
@@ -6,8 +6,8 @@ from services.model import load_model, get_device
|
|
| 6 |
# Initialize tokenizer
|
| 7 |
_tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 8 |
|
|
|
|
| 9 |
def generate_word(words, model, vocab, inv_vocab, max_length=64):
|
| 10 |
-
"""Generate an imaginary word and its definition from three input words."""
|
| 11 |
device = get_device()
|
| 12 |
|
| 13 |
# Tokenize input words
|
|
|
|
| 6 |
# Initialize tokenizer
|
| 7 |
_tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 8 |
|
| 9 |
+
# Generate an imaginary word and its definition from three input words.
|
| 10 |
def generate_word(words, model, vocab, inv_vocab, max_length=64):
|
|
|
|
| 11 |
device = get_device()
|
| 12 |
|
| 13 |
# Tokenize input words
|
src/services/model.py
CHANGED
|
@@ -13,14 +13,8 @@ _VOCAB_PATH = os.path.join(_VOCAB_DIR, "vocab.json")
|
|
| 13 |
# Internal device selection
|
| 14 |
_DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
|
| 15 |
|
|
|
|
| 16 |
def save_model(model, vocab):
|
| 17 |
-
"""
|
| 18 |
-
Save the model state and vocabulary to disk.
|
| 19 |
-
|
| 20 |
-
Args:
|
| 21 |
-
model: The trained transformer model
|
| 22 |
-
vocab: The vocabulary dictionary
|
| 23 |
-
"""
|
| 24 |
# Create necessary directories
|
| 25 |
os.makedirs(_MODEL_DIR, exist_ok=True)
|
| 26 |
os.makedirs(_VOCAB_DIR, exist_ok=True)
|
|
@@ -35,13 +29,8 @@ def save_model(model, vocab):
|
|
| 35 |
print(f"Model saved to {_MODEL_PATH}")
|
| 36 |
print(f"Vocabulary saved to {_VOCAB_PATH}")
|
| 37 |
|
|
|
|
| 38 |
def load_model():
|
| 39 |
-
"""
|
| 40 |
-
Load the model and its vocabulary from disk.
|
| 41 |
-
|
| 42 |
-
Returns:
|
| 43 |
-
tuple: (model, vocab, inv_vocab)
|
| 44 |
-
"""
|
| 45 |
# Load vocabulary
|
| 46 |
with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
|
| 47 |
vocab = json.load(f)
|
|
@@ -54,11 +43,6 @@ def load_model():
|
|
| 54 |
|
| 55 |
return model, vocab, inv_vocab
|
| 56 |
|
|
|
|
| 57 |
def get_device():
|
| 58 |
-
"""
|
| 59 |
-
Get the device being used for model operations.
|
| 60 |
-
|
| 61 |
-
Returns:
|
| 62 |
-
torch.device: The device being used
|
| 63 |
-
"""
|
| 64 |
return _DEVICE
|
|
|
|
| 13 |
# Internal device selection
|
| 14 |
_DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
|
| 15 |
|
| 16 |
+
# Save the model state and vocabulary to disk.
|
| 17 |
def save_model(model, vocab):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Create necessary directories
|
| 19 |
os.makedirs(_MODEL_DIR, exist_ok=True)
|
| 20 |
os.makedirs(_VOCAB_DIR, exist_ok=True)
|
|
|
|
| 29 |
print(f"Model saved to {_MODEL_PATH}")
|
| 30 |
print(f"Vocabulary saved to {_VOCAB_PATH}")
|
| 31 |
|
| 32 |
+
# Load the model and its vocabulary from disk.
|
| 33 |
def load_model():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# Load vocabulary
|
| 35 |
with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
|
| 36 |
vocab = json.load(f)
|
|
|
|
| 43 |
|
| 44 |
return model, vocab, inv_vocab
|
| 45 |
|
| 46 |
+
# Get the device being used for model operations.
|
| 47 |
def get_device():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
return _DEVICE
|
src/services/tokenizer.py
CHANGED
|
@@ -4,8 +4,8 @@ from constants.tokens import special_tokens
|
|
| 4 |
# Private tokenizer instance (internal use only)
|
| 5 |
_tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 6 |
|
|
|
|
| 7 |
def tokenize_dataset(data):
|
| 8 |
-
"""Tokenize keys and values using the internal tokenizer."""
|
| 9 |
inputs = []
|
| 10 |
outputs = []
|
| 11 |
for key, value in data.items():
|
|
@@ -15,8 +15,8 @@ def tokenize_dataset(data):
|
|
| 15 |
outputs.append(out_tokens)
|
| 16 |
return inputs, outputs
|
| 17 |
|
|
|
|
| 18 |
def build_vocab(inputs, outputs):
|
| 19 |
-
"""Build vocabulary mapping from token IDs and add special tokens."""
|
| 20 |
offset = len(special_tokens)
|
| 21 |
all_ids = set(tok for seq in inputs + outputs for tok in seq)
|
| 22 |
vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
|
|
|
|
| 4 |
# Private tokenizer instance (internal use only)
|
| 5 |
_tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 6 |
|
| 7 |
+
# Tokenize keys and values using the internal tokenizer.
|
| 8 |
def tokenize_dataset(data):
|
|
|
|
| 9 |
inputs = []
|
| 10 |
outputs = []
|
| 11 |
for key, value in data.items():
|
|
|
|
| 15 |
outputs.append(out_tokens)
|
| 16 |
return inputs, outputs
|
| 17 |
|
| 18 |
+
# Build vocabulary mapping from token IDs and add special tokens.
|
| 19 |
def build_vocab(inputs, outputs):
|
|
|
|
| 20 |
offset = len(special_tokens)
|
| 21 |
all_ids = set(tok for seq in inputs + outputs for tok in seq)
|
| 22 |
vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
|
src/services/transformer.py
CHANGED
|
@@ -1,38 +1,28 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
-
# import torch.nn.functional as F
|
| 4 |
from constants.tokens import PAD_ID
|
| 5 |
|
| 6 |
class TinyTransformer(nn.Module):
|
| 7 |
def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
|
| 8 |
super().__init__()
|
| 9 |
-
# self.pad_token_id = pad_token_id
|
| 10 |
-
|
| 11 |
self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
|
| 12 |
self.pos_encoder = PositionalEncoding(d_model, dropout)
|
| 13 |
-
|
| 14 |
encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
|
| 15 |
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
| 16 |
-
|
| 17 |
decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
|
| 18 |
self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
|
| 19 |
-
|
| 20 |
self.out = nn.Linear(d_model, vocab_size)
|
| 21 |
|
|
|
|
| 22 |
def forward(self, src, tgt):
|
| 23 |
-
# Keep tensors in batch-first format
|
| 24 |
tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()
|
| 25 |
-
|
| 26 |
src_emb = self.pos_encoder(self.embedding(src))
|
| 27 |
tgt_emb = self.pos_encoder(self.embedding(tgt))
|
| 28 |
-
|
| 29 |
# Create padding masks
|
| 30 |
src_padding_mask = (src == PAD_ID).bool()
|
| 31 |
tgt_padding_mask = (tgt == PAD_ID).bool()
|
| 32 |
-
|
| 33 |
memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
|
| 34 |
output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)
|
| 35 |
-
|
| 36 |
return self.out(output) # (batch, seq_len, vocab)
|
| 37 |
|
| 38 |
def generate_src_mask(self, size):
|
|
@@ -42,7 +32,6 @@ class PositionalEncoding(nn.Module):
|
|
| 42 |
def __init__(self, d_model, dropout=0.1, max_len=512):
|
| 43 |
super().__init__()
|
| 44 |
self.dropout = nn.Dropout(p=dropout)
|
| 45 |
-
|
| 46 |
position = torch.arange(0, max_len).unsqueeze(1)
|
| 47 |
div_term = torch.exp(
|
| 48 |
torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
|
|
@@ -50,7 +39,6 @@ class PositionalEncoding(nn.Module):
|
|
| 50 |
pe = torch.zeros(max_len, d_model)
|
| 51 |
pe[:, 0::2] = torch.sin(position * div_term) # even indices
|
| 52 |
pe[:, 1::2] = torch.cos(position * div_term) # odd indices
|
| 53 |
-
|
| 54 |
self.register_buffer('pe', pe.unsqueeze(0))
|
| 55 |
|
| 56 |
def forward(self, x):
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
|
|
|
| 3 |
from constants.tokens import PAD_ID
|
| 4 |
|
| 5 |
class TinyTransformer(nn.Module):
|
| 6 |
def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
|
| 7 |
super().__init__()
|
|
|
|
|
|
|
| 8 |
self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
|
| 9 |
self.pos_encoder = PositionalEncoding(d_model, dropout)
|
|
|
|
| 10 |
encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
|
| 11 |
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
|
|
|
| 12 |
decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
|
| 13 |
self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
|
|
|
|
| 14 |
self.out = nn.Linear(d_model, vocab_size)
|
| 15 |
|
| 16 |
+
# Keep tensors in batch-first format
|
| 17 |
def forward(self, src, tgt):
|
|
|
|
| 18 |
tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()
|
|
|
|
| 19 |
src_emb = self.pos_encoder(self.embedding(src))
|
| 20 |
tgt_emb = self.pos_encoder(self.embedding(tgt))
|
|
|
|
| 21 |
# Create padding masks
|
| 22 |
src_padding_mask = (src == PAD_ID).bool()
|
| 23 |
tgt_padding_mask = (tgt == PAD_ID).bool()
|
|
|
|
| 24 |
memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
|
| 25 |
output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)
|
|
|
|
| 26 |
return self.out(output) # (batch, seq_len, vocab)
|
| 27 |
|
| 28 |
def generate_src_mask(self, size):
|
|
|
|
| 32 |
def __init__(self, d_model, dropout=0.1, max_len=512):
|
| 33 |
super().__init__()
|
| 34 |
self.dropout = nn.Dropout(p=dropout)
|
|
|
|
| 35 |
position = torch.arange(0, max_len).unsqueeze(1)
|
| 36 |
div_term = torch.exp(
|
| 37 |
torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
|
|
|
|
| 39 |
pe = torch.zeros(max_len, d_model)
|
| 40 |
pe[:, 0::2] = torch.sin(position * div_term) # even indices
|
| 41 |
pe[:, 1::2] = torch.cos(position * div_term) # odd indices
|
|
|
|
| 42 |
self.register_buffer('pe', pe.unsqueeze(0))
|
| 43 |
|
| 44 |
def forward(self, x):
|