Rafael Camargo committed on
Commit
da3a6cf
·
1 Parent(s): 95de76e

chore: indent python files using 2 spaces as tab size

Browse files
src/constants/tokens.py CHANGED
@@ -1,7 +1,7 @@
1
  special_tokens = {
2
- '<pad>': 0,
3
- '<sos>': 1,
4
- '<eos>': 2,
5
  }
6
 
7
  PAD_ID = special_tokens["<pad>"]
 
1
  special_tokens = {
2
+ '<pad>': 0,
3
+ '<sos>': 1,
4
+ '<eos>': 2,
5
  }
6
 
7
  PAD_ID = special_tokens["<pad>"]
src/prediction.py CHANGED
@@ -7,43 +7,43 @@ from services.model import load_model, get_device
7
  _tokenizer = tiktoken.get_encoding("cl100k_base")
8
 
9
  def generate_word(words, model, vocab, inv_vocab, max_length=64):
10
- """Generate an imaginary word and its definition from three input words."""
11
- device = get_device()
12
-
13
- # Tokenize input words
14
- input_text = ",".join(words)
15
- input_tokens = _tokenizer.encode(input_text)
16
- input_tensor = torch.tensor([vocab.get(str(tok), vocab["<pad>"]) for tok in input_tokens]).unsqueeze(0).to(device)
17
-
18
- # Initialize target with SOS token
19
- target = torch.tensor([[vocab["<sos>"]]]).to(device)
20
-
21
- # Generate output
22
- with torch.no_grad():
23
- for _ in range(max_length):
24
- output = model(input_tensor, target)
25
- next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
26
-
27
- # Stop if we predict EOS token
28
- if next_token.item() == vocab["<eos>"]:
29
- break
30
-
31
- target = torch.cat([target, next_token], dim=1)
32
-
33
- # Convert output tokens to text
34
- output_tokens = target[0].cpu().numpy()
35
- output_text = _tokenizer.decode([int(inv_vocab[tok]) for tok in output_tokens if tok not in special_tokens.values()])
36
-
37
- return output_text
38
 
39
  def main():
40
- # Load model and vocabulary
41
- model, vocab, inv_vocab = load_model()
42
-
43
- # Example usage
44
- words = ["muito", "grande", "imenso"]
45
- result = generate_word(words, model, vocab, inv_vocab)
46
- print(f"Input words: {', '.join(words)}")
47
- print(f"Generated: {result}")
48
 
49
  main()
 
7
  _tokenizer = tiktoken.get_encoding("cl100k_base")
8
 
9
  def generate_word(words, model, vocab, inv_vocab, max_length=64):
10
+ """Generate an imaginary word and its definition from three input words."""
11
+ device = get_device()
12
+
13
+ # Tokenize input words
14
+ input_text = ",".join(words)
15
+ input_tokens = _tokenizer.encode(input_text)
16
+ input_tensor = torch.tensor([vocab.get(str(tok), vocab["<pad>"]) for tok in input_tokens]).unsqueeze(0).to(device)
17
+
18
+ # Initialize target with SOS token
19
+ target = torch.tensor([[vocab["<sos>"]]]).to(device)
20
+
21
+ # Generate output
22
+ with torch.no_grad():
23
+ for _ in range(max_length):
24
+ output = model(input_tensor, target)
25
+ next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
26
+
27
+ # Stop if we predict EOS token
28
+ if next_token.item() == vocab["<eos>"]:
29
+ break
30
+
31
+ target = torch.cat([target, next_token], dim=1)
32
+
33
+ # Convert output tokens to text
34
+ output_tokens = target[0].cpu().numpy()
35
+ output_text = _tokenizer.decode([int(inv_vocab[tok]) for tok in output_tokens if tok not in special_tokens.values()])
36
+
37
+ return output_text
38
 
39
  def main():
40
+ # Load model and vocabulary
41
+ model, vocab, inv_vocab = load_model()
42
+
43
+ # Example usage
44
+ words = ["muito", "grande", "imenso"]
45
+ result = generate_word(words, model, vocab, inv_vocab)
46
+ print(f"Input words: {', '.join(words)}")
47
+ print(f"Generated: {result}")
48
 
49
  main()
src/services/model.py CHANGED
@@ -14,51 +14,51 @@ _VOCAB_PATH = os.path.join(_VOCAB_DIR, "vocab.json")
14
  _DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
15
 
16
  def save_model(model, vocab):
17
- """
18
- Save the model state and vocabulary to disk.
19
-
20
- Args:
21
- model: The trained transformer model
22
- vocab: The vocabulary dictionary
23
- """
24
- # Create necessary directories
25
- os.makedirs(_MODEL_DIR, exist_ok=True)
26
- os.makedirs(_VOCAB_DIR, exist_ok=True)
27
-
28
- # Save model state
29
- torch.save(model.state_dict(), _MODEL_PATH)
30
-
31
- # Save vocabulary
32
- with open(_VOCAB_PATH, "w", encoding="utf-8") as f:
33
- json.dump(vocab, f, ensure_ascii=False, indent=2)
34
-
35
- print(f"Model saved to {_MODEL_PATH}")
36
- print(f"Vocabulary saved to {_VOCAB_PATH}")
37
 
38
  def load_model():
39
- """
40
- Load the model and its vocabulary from disk.
41
-
42
- Returns:
43
- tuple: (model, vocab, inv_vocab)
44
- """
45
- # Load vocabulary
46
- with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
47
- vocab = json.load(f)
48
- inv_vocab = {int(v): k for k, v in vocab.items()}
49
-
50
- # Initialize and load model
51
- model = TinyTransformer(vocab_size=len(vocab)).to(_DEVICE)
52
- model.load_state_dict(torch.load(_MODEL_PATH, map_location=_DEVICE))
53
- model.eval()
54
-
55
- return model, vocab, inv_vocab
56
 
57
  def get_device():
58
- """
59
- Get the device being used for model operations.
60
-
61
- Returns:
62
- torch.device: The device being used
63
- """
64
- return _DEVICE
 
14
  _DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
15
 
16
  def save_model(model, vocab):
17
+ """
18
+ Save the model state and vocabulary to disk.
19
+
20
+ Args:
21
+ model: The trained transformer model
22
+ vocab: The vocabulary dictionary
23
+ """
24
+ # Create necessary directories
25
+ os.makedirs(_MODEL_DIR, exist_ok=True)
26
+ os.makedirs(_VOCAB_DIR, exist_ok=True)
27
+
28
+ # Save model state
29
+ torch.save(model.state_dict(), _MODEL_PATH)
30
+
31
+ # Save vocabulary
32
+ with open(_VOCAB_PATH, "w", encoding="utf-8") as f:
33
+ json.dump(vocab, f, ensure_ascii=False, indent=2)
34
+
35
+ print(f"Model saved to {_MODEL_PATH}")
36
+ print(f"Vocabulary saved to {_VOCAB_PATH}")
37
 
38
  def load_model():
39
+ """
40
+ Load the model and its vocabulary from disk.
41
+
42
+ Returns:
43
+ tuple: (model, vocab, inv_vocab)
44
+ """
45
+ # Load vocabulary
46
+ with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
47
+ vocab = json.load(f)
48
+ inv_vocab = {int(v): k for k, v in vocab.items()}
49
+
50
+ # Initialize and load model
51
+ model = TinyTransformer(vocab_size=len(vocab)).to(_DEVICE)
52
+ model.load_state_dict(torch.load(_MODEL_PATH, map_location=_DEVICE))
53
+ model.eval()
54
+
55
+ return model, vocab, inv_vocab
56
 
57
  def get_device():
58
+ """
59
+ Get the device being used for model operations.
60
+
61
+ Returns:
62
+ torch.device: The device being used
63
+ """
64
+ return _DEVICE
src/services/tokenizer.py CHANGED
@@ -5,21 +5,21 @@ from constants.tokens import special_tokens
5
  _tokenizer = tiktoken.get_encoding("cl100k_base")
6
 
7
  def tokenize_dataset(data):
8
- """Tokenize keys and values using the internal tokenizer."""
9
- inputs = []
10
- outputs = []
11
- for key, value in data.items():
12
- inp_tokens = _tokenizer.encode(key)
13
- out_tokens = _tokenizer.encode(value)
14
- inputs.append(inp_tokens)
15
- outputs.append(out_tokens)
16
- return inputs, outputs
17
 
18
  def build_vocab(inputs, outputs):
19
- """Build vocabulary mapping from token IDs and add special tokens."""
20
- offset = len(special_tokens)
21
- all_ids = set(tok for seq in inputs + outputs for tok in seq)
22
- vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
23
- vocab.update({k: v for k, v in special_tokens.items()})
24
- inv_vocab = {v: k for k, v in vocab.items()}
25
- return vocab, inv_vocab
 
5
  _tokenizer = tiktoken.get_encoding("cl100k_base")
6
 
7
  def tokenize_dataset(data):
8
+ """Tokenize keys and values using the internal tokenizer."""
9
+ inputs = []
10
+ outputs = []
11
+ for key, value in data.items():
12
+ inp_tokens = _tokenizer.encode(key)
13
+ out_tokens = _tokenizer.encode(value)
14
+ inputs.append(inp_tokens)
15
+ outputs.append(out_tokens)
16
+ return inputs, outputs
17
 
18
  def build_vocab(inputs, outputs):
19
+ """Build vocabulary mapping from token IDs and add special tokens."""
20
+ offset = len(special_tokens)
21
+ all_ids = set(tok for seq in inputs + outputs for tok in seq)
22
+ vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
23
+ vocab.update({k: v for k, v in special_tokens.items()})
24
+ inv_vocab = {v: k for k, v in vocab.items()}
25
+ return vocab, inv_vocab
src/services/transformer.py CHANGED
@@ -4,55 +4,55 @@ import torch.nn as nn
4
  from constants.tokens import PAD_ID
5
 
6
  class TinyTransformer(nn.Module):
7
- def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
8
- super().__init__()
9
- # self.pad_token_id = pad_token_id
10
 
11
- self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
12
- self.pos_encoder = PositionalEncoding(d_model, dropout)
13
 
14
- encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
15
- self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
16
 
17
- decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
18
- self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
19
 
20
- self.out = nn.Linear(d_model, vocab_size)
21
 
22
- def forward(self, src, tgt):
23
- # Keep tensors in batch-first format
24
- tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()
25
 
26
- src_emb = self.pos_encoder(self.embedding(src))
27
- tgt_emb = self.pos_encoder(self.embedding(tgt))
28
 
29
- # Create padding masks
30
- src_padding_mask = (src == PAD_ID).bool()
31
- tgt_padding_mask = (tgt == PAD_ID).bool()
32
 
33
- memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
34
- output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)
35
 
36
- return self.out(output) # (batch, seq_len, vocab)
37
 
38
- def generate_src_mask(self, size):
39
- return torch.zeros((size, size), device='cpu').type(torch.bool)
40
 
41
  class PositionalEncoding(nn.Module):
42
- def __init__(self, d_model, dropout=0.1, max_len=512):
43
- super().__init__()
44
- self.dropout = nn.Dropout(p=dropout)
45
-
46
- position = torch.arange(0, max_len).unsqueeze(1)
47
- div_term = torch.exp(
48
- torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
49
- )
50
- pe = torch.zeros(max_len, d_model)
51
- pe[:, 0::2] = torch.sin(position * div_term) # even indices
52
- pe[:, 1::2] = torch.cos(position * div_term) # odd indices
53
-
54
- self.register_buffer('pe', pe.unsqueeze(0))
55
-
56
- def forward(self, x):
57
- x = x + self.pe[:, :x.size(1), :].to(x.device)
58
- return self.dropout(x)
 
4
  from constants.tokens import PAD_ID
5
 
6
  class TinyTransformer(nn.Module):
7
+ def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
8
+ super().__init__()
9
+ # self.pad_token_id = pad_token_id
10
 
11
+ self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
12
+ self.pos_encoder = PositionalEncoding(d_model, dropout)
13
 
14
+ encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
15
+ self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
16
 
17
+ decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
18
+ self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
19
 
20
+ self.out = nn.Linear(d_model, vocab_size)
21
 
22
+ def forward(self, src, tgt):
23
+ # Keep tensors in batch-first format
24
+ tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()
25
 
26
+ src_emb = self.pos_encoder(self.embedding(src))
27
+ tgt_emb = self.pos_encoder(self.embedding(tgt))
28
 
29
+ # Create padding masks
30
+ src_padding_mask = (src == PAD_ID).bool()
31
+ tgt_padding_mask = (tgt == PAD_ID).bool()
32
 
33
+ memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
34
+ output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)
35
 
36
+ return self.out(output) # (batch, seq_len, vocab)
37
 
38
+ def generate_src_mask(self, size):
39
+ return torch.zeros((size, size), device='cpu').type(torch.bool)
40
 
41
  class PositionalEncoding(nn.Module):
42
+ def __init__(self, d_model, dropout=0.1, max_len=512):
43
+ super().__init__()
44
+ self.dropout = nn.Dropout(p=dropout)
45
+
46
+ position = torch.arange(0, max_len).unsqueeze(1)
47
+ div_term = torch.exp(
48
+ torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
49
+ )
50
+ pe = torch.zeros(max_len, d_model)
51
+ pe[:, 0::2] = torch.sin(position * div_term) # even indices
52
+ pe[:, 1::2] = torch.cos(position * div_term) # odd indices
53
+
54
+ self.register_buffer('pe', pe.unsqueeze(0))
55
+
56
+ def forward(self, x):
57
+ x = x + self.pe[:, :x.size(1), :].to(x.device)
58
+ return self.dropout(x)
src/services/word_generation_dataset.py CHANGED
@@ -4,27 +4,27 @@ from torch.nn.utils.rnn import pad_sequence
4
  from constants.tokens import PAD_ID
5
 
6
  def encode_with_specials(token_ids, vocab, add_sos_eos=False):
7
- if add_sos_eos:
8
- return [vocab['<sos>']] + [vocab[t] for t in token_ids] + [vocab['<eos>']]
9
- return [vocab[t] for t in token_ids]
10
 
11
  class WordGenDataset(Dataset):
12
- def __init__(self, inputs, outputs, vocab, max_len=64):
13
- self.inputs = inputs
14
- self.outputs = outputs
15
- self.vocab = vocab
16
- self.max_len = max_len
17
 
18
- def __len__(self):
19
- return len(self.inputs)
20
 
21
- def __getitem__(self, idx):
22
- x = encode_with_specials(self.inputs[idx], self.vocab)
23
- y = encode_with_specials(self.outputs[idx], self.vocab, add_sos_eos=True)
24
- return torch.tensor(x), torch.tensor(y)
25
 
26
  def collate_fn(batch):
27
- xs, ys = zip(*batch)
28
- xs = pad_sequence(xs, batch_first=True, padding_value=PAD_ID)
29
- ys = pad_sequence(ys, batch_first=True, padding_value=PAD_ID)
30
- return xs, ys
 
4
  from constants.tokens import PAD_ID
5
 
6
  def encode_with_specials(token_ids, vocab, add_sos_eos=False):
7
+ if add_sos_eos:
8
+ return [vocab['<sos>']] + [vocab[t] for t in token_ids] + [vocab['<eos>']]
9
+ return [vocab[t] for t in token_ids]
10
 
11
  class WordGenDataset(Dataset):
12
+ def __init__(self, inputs, outputs, vocab, max_len=64):
13
+ self.inputs = inputs
14
+ self.outputs = outputs
15
+ self.vocab = vocab
16
+ self.max_len = max_len
17
 
18
+ def __len__(self):
19
+ return len(self.inputs)
20
 
21
+ def __getitem__(self, idx):
22
+ x = encode_with_specials(self.inputs[idx], self.vocab)
23
+ y = encode_with_specials(self.outputs[idx], self.vocab, add_sos_eos=True)
24
+ return torch.tensor(x), torch.tensor(y)
25
 
26
  def collate_fn(batch):
27
+ xs, ys = zip(*batch)
28
+ xs = pad_sequence(xs, batch_first=True, padding_value=PAD_ID)
29
+ ys = pad_sequence(ys, batch_first=True, padding_value=PAD_ID)
30
+ return xs, ys
src/training.py CHANGED
@@ -49,32 +49,32 @@ optimizer = optim.Adam(model.parameters(), lr=1e-4)
49
  num_epochs = 10
50
 
51
  for epoch in range(num_epochs):
52
- model.train()
53
- total_loss = 0
54
 
55
- for batch in dataloader:
56
- src, tgt = batch
57
- src, tgt = src.to(device), tgt.to(device)
58
 
59
- # Shift target to create input/target pairs
60
- tgt_input = tgt[:, :-1]
61
- tgt_expected = tgt[:, 1:]
62
 
63
- # Forward pass
64
- logits = model(src, tgt_input)
65
 
66
- # Reshape for loss: (batch*seq_len, vocab_size)
67
- loss = criterion(logits.reshape(-1, vocab_size), tgt_expected.reshape(-1))
68
 
69
- # Backpropagation
70
- optimizer.zero_grad()
71
- loss.backward()
72
- optimizer.step()
73
 
74
- total_loss += loss.item()
75
 
76
- avg_loss = total_loss / len(dataloader)
77
- print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")
78
 
79
  # 2.4 Save model
80
  print("Saving...")
 
49
  num_epochs = 10
50
 
51
  for epoch in range(num_epochs):
52
+ model.train()
53
+ total_loss = 0
54
 
55
+ for batch in dataloader:
56
+ src, tgt = batch
57
+ src, tgt = src.to(device), tgt.to(device)
58
 
59
+ # Shift target to create input/target pairs
60
+ tgt_input = tgt[:, :-1]
61
+ tgt_expected = tgt[:, 1:]
62
 
63
+ # Forward pass
64
+ logits = model(src, tgt_input)
65
 
66
+ # Reshape for loss: (batch*seq_len, vocab_size)
67
+ loss = criterion(logits.reshape(-1, vocab_size), tgt_expected.reshape(-1))
68
 
69
+ # Backpropagation
70
+ optimizer.zero_grad()
71
+ loss.backward()
72
+ optimizer.step()
73
 
74
+ total_loss += loss.item()
75
 
76
+ avg_loss = total_loss / len(dataloader)
77
+ print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")
78
 
79
  # 2.4 Save model
80
  print("Saving...")