Rafael Camargo committed
Commit 94aa96e · 1 Parent(s): da3a6cf

chore: improve comments and remove unnecessary blank lines

src/prediction.py CHANGED
@@ -6,8 +6,8 @@ from services.model import load_model, get_device
 # Initialize tokenizer
 _tokenizer = tiktoken.get_encoding("cl100k_base")
 
+# Generate an imaginary word and its definition from three input words.
 def generate_word(words, model, vocab, inv_vocab, max_length=64):
-    """Generate an imaginary word and its definition from three input words."""
     device = get_device()
 
     # Tokenize input words
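
The hunk ends before the body of generate_word, so only its head is visible here. For orientation, this is a minimal sketch of the greedy decoding loop such a function typically contains; SOS_ID/EOS_ID, the vocab remapping, and the loop itself are assumptions, not code from this commit:

import torch

# Hypothetical reconstruction of generate_word's body (not from this diff).
# SOS_ID and EOS_ID stand in for whatever constants.tokens actually defines.
def generate_word(words, model, vocab, inv_vocab, max_length=64):
    device = get_device()
    # Tokenize input words, remapping raw tiktoken ids through the vocab
    src_ids = [vocab[t] for w in words for t in _tokenizer.encode(w)]
    src = torch.tensor([src_ids], device=device)
    generated = [SOS_ID]
    for _ in range(max_length):
        tgt = torch.tensor([generated], device=device)
        logits = model(src, tgt)                  # (1, len(generated), vocab)
        next_id = logits[0, -1].argmax().item()   # greedy: highest-scoring token
        if next_id == EOS_ID:
            break
        generated.append(next_id)
    # Map internal ids back to tiktoken ids, then decode to text
    return _tokenizer.decode([inv_vocab[i] for i in generated[1:]])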
src/services/model.py CHANGED
@@ -13,14 +13,8 @@ _VOCAB_PATH = os.path.join(_VOCAB_DIR, "vocab.json")
 # Internal device selection
 _DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
 
+# Save the model state and vocabulary to disk.
 def save_model(model, vocab):
-    """
-    Save the model state and vocabulary to disk.
-
-    Args:
-        model: The trained transformer model
-        vocab: The vocabulary dictionary
-    """
     # Create necessary directories
     os.makedirs(_MODEL_DIR, exist_ok=True)
     os.makedirs(_VOCAB_DIR, exist_ok=True)
@@ -35,13 +29,8 @@ def save_model(model, vocab):
     print(f"Model saved to {_MODEL_PATH}")
     print(f"Vocabulary saved to {_VOCAB_PATH}")
 
+# Load the model and its vocabulary from disk.
 def load_model():
-    """
-    Load the model and its vocabulary from disk.
-
-    Returns:
-        tuple: (model, vocab, inv_vocab)
-    """
     # Load vocabulary
     with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
         vocab = json.load(f)
@@ -54,11 +43,6 @@ def load_model():
 
     return model, vocab, inv_vocab
 
+# Get the device being used for model operations.
 def get_device():
-    """
-    Get the device being used for model operations.
-
-    Returns:
-        torch.device: The device being used
-    """
     return _DEVICE
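
The diff context skips the serialization calls themselves. A plausible sketch of the state_dict-plus-JSON pattern these functions appear to follow; the torch.save/torch.load calls, the int-key casting, and the TinyTransformer constructor argument are assumptions inferred from the visible paths, not lines in this commit:

import json
import os
import torch

def save_model(model, vocab):
    # Create necessary directories
    os.makedirs(_MODEL_DIR, exist_ok=True)
    os.makedirs(_VOCAB_DIR, exist_ok=True)
    torch.save(model.state_dict(), _MODEL_PATH)  # weights only, not the class
    with open(_VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f)
    print(f"Model saved to {_MODEL_PATH}")
    print(f"Vocabulary saved to {_VOCAB_PATH}")

def load_model():
    # Load vocabulary (JSON round-trips keys as strings, so cast back to int)
    with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
        vocab = {int(k): v for k, v in json.load(f).items()}
    inv_vocab = {v: k for k, v in vocab.items()}
    model = TinyTransformer(vocab_size=len(vocab) + len(special_tokens))
    model.load_state_dict(torch.load(_MODEL_PATH, map_location=_DEVICE))
    model.to(_DEVICE).eval()
    return model, vocab, inv_vocab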
src/services/tokenizer.py CHANGED
@@ -4,8 +4,8 @@ from constants.tokens import special_tokens
 # Private tokenizer instance (internal use only)
 _tokenizer = tiktoken.get_encoding("cl100k_base")
 
+# Tokenize keys and values using the internal tokenizer.
 def tokenize_dataset(data):
-    """Tokenize keys and values using the internal tokenizer."""
     inputs = []
     outputs = []
     for key, value in data.items():
@@ -15,8 +15,8 @@ def tokenize_dataset(data):
         outputs.append(out_tokens)
     return inputs, outputs
 
+# Build vocabulary mapping from token IDs and add special tokens.
 def build_vocab(inputs, outputs):
-    """Build vocabulary mapping from token IDs and add special tokens."""
     offset = len(special_tokens)
     all_ids = set(tok for seq in inputs + outputs for tok in seq)
     vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
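
A quick worked example of the offset logic in build_vocab: special tokens keep ids 0..offset-1, and every tiktoken id seen in the data is compacted into the range above them. The shape of special_tokens and the sample ids are illustrative assumptions:

special_tokens = {"<pad>": 0, "<sos>": 1, "<eos>": 2}  # assumed shape
offset = len(special_tokens)                           # 3

inputs = [[15339, 1917]]   # tiktoken cl100k_base ids, e.g. "hello world"
outputs = [[8715, 2337]]   # made-up definition tokens
all_ids = set(tok for seq in inputs + outputs for tok in seq)
vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
print(vocab)  # {1917: 3, 2337: 4, 8715: 5, 15339: 6}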
src/services/transformer.py CHANGED
@@ -1,38 +1,28 @@
 import torch
 import torch.nn as nn
-# import torch.nn.functional as F
 from constants.tokens import PAD_ID
 
 class TinyTransformer(nn.Module):
     def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
         super().__init__()
-        # self.pad_token_id = pad_token_id
-
         self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
         self.pos_encoder = PositionalEncoding(d_model, dropout)
-
         encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
         self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
-
         decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
         self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
-
         self.out = nn.Linear(d_model, vocab_size)
 
+    # Keep tensors in batch-first format
     def forward(self, src, tgt):
-        # Keep tensors in batch-first format
         tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()
-
         src_emb = self.pos_encoder(self.embedding(src))
         tgt_emb = self.pos_encoder(self.embedding(tgt))
-
         # Create padding masks
         src_padding_mask = (src == PAD_ID).bool()
         tgt_padding_mask = (tgt == PAD_ID).bool()
-
         memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
         output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)
-
         return self.out(output)  # (batch, seq_len, vocab)
 
     def generate_src_mask(self, size):
@@ -42,7 +32,6 @@ class PositionalEncoding(nn.Module):
     def __init__(self, d_model, dropout=0.1, max_len=512):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
-
         position = torch.arange(0, max_len).unsqueeze(1)
         div_term = torch.exp(
             torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
@@ -50,7 +39,6 @@ class PositionalEncoding(nn.Module):
         pe = torch.zeros(max_len, d_model)
         pe[:, 0::2] = torch.sin(position * div_term)  # even indices
         pe[:, 1::2] = torch.cos(position * div_term)  # odd indices
-
         self.register_buffer('pe', pe.unsqueeze(0))
 
     def forward(self, x):
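
Two notes on this file. The div_term expression computes 10000^(-2i/d_model), the standard sinusoidal schedule, and the hunk cuts off at PositionalEncoding.forward, whose conventional body is x + self.pe[:, :x.size(1)] followed by dropout. Below is a short smoke test of the model's batch-first shapes; the import path and the PAD_ID value of 0 are assumptions based on the imports visible in this commit:

import torch
from services.transformer import TinyTransformer  # path assumed from repo layout

PAD_ID = 0  # assumed; the real value comes from constants.tokens

model = TinyTransformer(vocab_size=1000)
src = torch.randint(1, 1000, (2, 16))  # (batch, seq_len), batch_first=True
tgt = torch.randint(1, 1000, (2, 8))
src[0, 12:] = PAD_ID                   # these positions get masked in the encoder
logits = model(src, tgt)
print(logits.shape)                    # torch.Size([2, 8, 1000])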