Rafael Camargo
committed
Commit · da3a6cf
Parent(s): 95de76e
chore: indent python files using 2 spaces as tab size
- src/constants/tokens.py +3 -3
- src/prediction.py +36 -36
- src/services/model.py +44 -44
- src/services/tokenizer.py +16 -16
- src/services/transformer.py +40 -40
- src/services/word_generation_dataset.py +18 -18
- src/training.py +19 -19
src/constants/tokens.py
CHANGED
@@ -1,7 +1,7 @@
special_tokens = {
  '<pad>': 0,
  '<sos>': 1,
  '<eos>': 2,
}

PAD_ID = special_tokens["<pad>"]
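These three IDs are the contract the rest of the repo leans on: <pad> feeds nn.Embedding's padding_idx and the padding masks in transformer.py, and the dict's length sets the remapping offset used by build_vocab. A minimal sanity-check sketch (plain Python, not part of this commit):

# Sanity check of the special-token contract assumed by the files below.
special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
PAD_ID = special_tokens['<pad>']

assert len(set(special_tokens.values())) == 3               # IDs must be distinct
assert max(special_tokens.values()) < len(special_tokens)   # all below build_vocab's offset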
src/prediction.py
CHANGED
@@ -7,43 +7,43 @@ from services.model import load_model, get_device
_tokenizer = tiktoken.get_encoding("cl100k_base")

def generate_word(words, model, vocab, inv_vocab, max_length=64):
  """Generate an imaginary word and its definition from three input words."""
  device = get_device()

  # Tokenize input words
  input_text = ",".join(words)
  input_tokens = _tokenizer.encode(input_text)
  input_tensor = torch.tensor([vocab.get(str(tok), vocab["<pad>"]) for tok in input_tokens]).unsqueeze(0).to(device)

  # Initialize target with SOS token
  target = torch.tensor([[vocab["<sos>"]]]).to(device)

  # Generate output
  with torch.no_grad():
    for _ in range(max_length):
      output = model(input_tensor, target)
      next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)

      # Stop if we predict EOS token
      if next_token.item() == vocab["<eos>"]:
        break

      target = torch.cat([target, next_token], dim=1)

  # Convert output tokens to text
  output_tokens = target[0].cpu().numpy()
  output_text = _tokenizer.decode([int(inv_vocab[tok]) for tok in output_tokens if tok not in special_tokens.values()])

  return output_text

def main():
  # Load model and vocabulary
  model, vocab, inv_vocab = load_model()

  # Example usage
  words = ["muito", "grande", "imenso"]
  result = generate_word(words, model, vocab, inv_vocab)
  print(f"Input words: {', '.join(words)}")
  print(f"Generated: {result}")

main()
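generate_word decodes greedily, taking the argmax at every step, so a given input triple always produces the same output. If variety is wanted, the usual swap is temperature sampling; a hedged sketch of that replacement step (sample_next_token is hypothetical, not part of this diff):

import torch

def sample_next_token(logits, temperature=0.8):
  # logits: (batch, vocab) scores for the last position, i.e. output[:, -1, :]
  probs = torch.softmax(logits / temperature, dim=-1)
  return torch.multinomial(probs, num_samples=1)  # (batch, 1), same shape as argmax(keepdim=True)

# Hypothetical drop-in inside the generation loop above:
# next_token = sample_next_token(output[:, -1, :])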
src/services/model.py
CHANGED
@@ -14,51 +14,51 @@ _VOCAB_PATH = os.path.join(_VOCAB_DIR, "vocab.json")
_DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def save_model(model, vocab):
  """
  Save the model state and vocabulary to disk.

  Args:
    model: The trained transformer model
    vocab: The vocabulary dictionary
  """
  # Create necessary directories
  os.makedirs(_MODEL_DIR, exist_ok=True)
  os.makedirs(_VOCAB_DIR, exist_ok=True)

  # Save model state
  torch.save(model.state_dict(), _MODEL_PATH)

  # Save vocabulary
  with open(_VOCAB_PATH, "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

  print(f"Model saved to {_MODEL_PATH}")
  print(f"Vocabulary saved to {_VOCAB_PATH}")

def load_model():
  """
  Load the model and its vocabulary from disk.

  Returns:
    tuple: (model, vocab, inv_vocab)
  """
  # Load vocabulary
  with open(_VOCAB_PATH, "r", encoding="utf-8") as f:
    vocab = json.load(f)
  inv_vocab = {int(v): k for k, v in vocab.items()}

  # Initialize and load model
  model = TinyTransformer(vocab_size=len(vocab)).to(_DEVICE)
  model.load_state_dict(torch.load(_MODEL_PATH, map_location=_DEVICE))
  model.eval()

  return model, vocab, inv_vocab

def get_device():
  """
  Get the device being used for model operations.

  Returns:
    torch.device: The device being used
  """
  return _DEVICE
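_DEVICE is pinned at import time to MPS or CPU, which suggests an Apple-silicon setup. A hedged sketch of a broader fallback chain (CUDA first) should the repo run elsewhere; pick_device is hypothetical and not part of this commit:

import torch

def pick_device():
  # Prefer CUDA, then Apple MPS, then CPU.
  if torch.cuda.is_available():
    return torch.device("cuda")
  if torch.backends.mps.is_available():
    return torch.device("mps")
  return torch.device("cpu")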
src/services/tokenizer.py
CHANGED
@@ -5,21 +5,21 @@ from constants.tokens import special_tokens
_tokenizer = tiktoken.get_encoding("cl100k_base")

def tokenize_dataset(data):
  """Tokenize keys and values using the internal tokenizer."""
  inputs = []
  outputs = []
  for key, value in data.items():
    inp_tokens = _tokenizer.encode(key)
    out_tokens = _tokenizer.encode(value)
    inputs.append(inp_tokens)
    outputs.append(out_tokens)
  return inputs, outputs

def build_vocab(inputs, outputs):
  """Build vocabulary mapping from token IDs and add special tokens."""
  offset = len(special_tokens)
  all_ids = set(tok for seq in inputs + outputs for tok in seq)
  vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
  vocab.update({k: v for k, v in special_tokens.items()})
  inv_vocab = {v: k for k, v in vocab.items()}
  return vocab, inv_vocab
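For clarity, a worked example of what build_vocab produces: raw tiktoken IDs are remapped into a dense range starting right after the special tokens. The input values here are hypothetical:

# Worked example of build_vocab's remapping (hypothetical token IDs).
special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
inputs = [[605, 17]]    # tiktoken IDs for one key
outputs = [[17, 9000]]  # tiktoken IDs for its value

offset = len(special_tokens)  # 3
all_ids = sorted({tok for seq in inputs + outputs for tok in seq})  # [17, 605, 9000]
vocab = {tok: i + offset for i, tok in enumerate(all_ids)}
vocab.update(special_tokens)
print(vocab)  # {17: 3, 605: 4, 9000: 5, '<pad>': 0, '<sos>': 1, '<eos>': 2}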
src/services/transformer.py
CHANGED
@@ -4,55 +4,55 @@ import torch.nn as nn
from constants.tokens import PAD_ID

class TinyTransformer(nn.Module):
  def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, dropout=0.1):
    super().__init__()
    # self.pad_token_id = pad_token_id

    self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
    self.pos_encoder = PositionalEncoding(d_model, dropout)

    encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
    self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

    self.out = nn.Linear(d_model, vocab_size)

  def forward(self, src, tgt):
    # Keep tensors in batch-first format
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device).bool()

    src_emb = self.pos_encoder(self.embedding(src))
    tgt_emb = self.pos_encoder(self.embedding(tgt))

    # Create padding masks
    src_padding_mask = (src == PAD_ID).bool()
    tgt_padding_mask = (tgt == PAD_ID).bool()

    memory = self.encoder(src_emb, src_key_padding_mask=src_padding_mask)
    output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)

    return self.out(output)  # (batch, seq_len, vocab)

  def generate_src_mask(self, size):
    return torch.zeros((size, size), device='cpu').type(torch.bool)

class PositionalEncoding(nn.Module):
  def __init__(self, d_model, dropout=0.1, max_len=512):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)

    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(
      torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
    )
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(position * div_term)  # even indices
    pe[:, 1::2] = torch.cos(position * div_term)  # odd indices

    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    x = x + self.pe[:, :x.size(1), :].to(x.device)
    return self.dropout(x)
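A quick shape smoke test for the model above, assuming src/ is on the import path; the vocab size and tensor contents are hypothetical:

import torch
from services.transformer import TinyTransformer  # assumes src/ is on sys.path

model = TinyTransformer(vocab_size=100)
src = torch.randint(3, 100, (2, 10))  # (batch=2, src_len=10), IDs past the specials
tgt = torch.randint(3, 100, (2, 7))   # (batch=2, tgt_len=7)
logits = model(src, tgt)
assert logits.shape == (2, 7, 100)    # (batch, tgt_len, vocab), as forward() returns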
src/services/word_generation_dataset.py
CHANGED
@@ -4,27 +4,27 @@ from torch.nn.utils.rnn import pad_sequence
from constants.tokens import PAD_ID

def encode_with_specials(token_ids, vocab, add_sos_eos=False):
  if add_sos_eos:
    return [vocab['<sos>']] + [vocab[t] for t in token_ids] + [vocab['<eos>']]
  return [vocab[t] for t in token_ids]

class WordGenDataset(Dataset):
  def __init__(self, inputs, outputs, vocab, max_len=64):
    self.inputs = inputs
    self.outputs = outputs
    self.vocab = vocab
    self.max_len = max_len

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, idx):
    x = encode_with_specials(self.inputs[idx], self.vocab)
    y = encode_with_specials(self.outputs[idx], self.vocab, add_sos_eos=True)
    return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
  xs, ys = zip(*batch)
  xs = pad_sequence(xs, batch_first=True, padding_value=PAD_ID)
  ys = pad_sequence(ys, batch_first=True, padding_value=PAD_ID)
  return xs, ys
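A hedged sketch of wiring the dataset and collate function into a DataLoader, with hypothetical already-remapped vocab-ID sequences; batches come out padded to the longest sequence in each batch:

from torch.utils.data import DataLoader
from services.word_generation_dataset import WordGenDataset, collate_fn  # assumes src/ on sys.path

# Hypothetical sequences already remapped to vocab IDs.
inputs = [[3, 4], [5]]
outputs = [[4, 5], [3]]
vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, 3: 3, 4: 4, 5: 5}

loader = DataLoader(WordGenDataset(inputs, outputs, vocab), batch_size=2, collate_fn=collate_fn)
xs, ys = next(iter(loader))
print(xs.shape, ys.shape)  # torch.Size([2, 2]) torch.Size([2, 4]) after padding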
src/training.py
CHANGED
@@ -49,32 +49,32 @@ optimizer = optim.Adam(model.parameters(), lr=1e-4)
num_epochs = 10

for epoch in range(num_epochs):
  model.train()
  total_loss = 0

  for batch in dataloader:
    src, tgt = batch
    src, tgt = src.to(device), tgt.to(device)

    # Shift target to create input/target pairs
    tgt_input = tgt[:, :-1]
    tgt_expected = tgt[:, 1:]

    # Forward pass
    logits = model(src, tgt_input)

    # Reshape for loss: (batch*seq_len, vocab_size)
    loss = criterion(logits.reshape(-1, vocab_size), tgt_expected.reshape(-1))

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  avg_loss = total_loss / len(dataloader)
  print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

# 2.4 Save model
print("Saving...")
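The flattened loss above only trains cleanly if padded positions are excluded. criterion is defined in the part of training.py outside this hunk, so its exact form is an assumption, but it would need to look roughly like:

import torch.nn as nn
from constants.tokens import PAD_ID  # assumes src/ on sys.path

# Assumed definition from earlier in training.py (not shown in this diff):
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)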