AlainDeLong commited on
Commit
e27ab6a
·
1 Parent(s): 1a0fc46

Create translate app

Browse files
Files changed (14) hide show
  1. .gitignore +227 -0
  2. Dockerfile +9 -1
  3. requirements.txt +4 -1
  4. src/callbacks.py +24 -0
  5. src/config.py +70 -0
  6. src/dataset.py +280 -0
  7. src/embedding.py +105 -0
  8. src/engine.py +278 -0
  9. src/layers.py +186 -0
  10. src/model.py +207 -0
  11. src/modules.py +323 -0
  12. src/streamlit_app.py +176 -38
  13. src/tokenizer.py +156 -0
  14. src/utils.py +375 -0
.gitignore ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # # Image
210
+ # images/
211
+
212
+ # Dataset
213
+ data/en-vi.txt/
214
+ data/IWSLT'15 en-vi/
215
+ notebooks/processed_data/
216
+ notebooks/IWSLT-15-en-vi/
217
+
218
+ # MLflow
219
+ mlruns/
220
+
221
+ # Temp Files
222
+ scratch/
223
+ notebooks/
224
+ test_push_to_hub.ipynb
225
+
226
+ # Weights & Biases
227
+ wandb/
Dockerfile CHANGED
@@ -13,8 +13,16 @@ COPY src/ ./src/
13
 
14
  RUN pip3 install -r requirements.txt
15
 
 
 
 
 
 
 
 
16
  EXPOSE 8501
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
 
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
13
 
14
  RUN pip3 install -r requirements.txt
15
 
16
+ RUN mkdir -p /app/hf_cache
17
+
18
+ ENV HF_HOME="/app/hf_cache"
19
+
20
+ RUN chmod -R 777 /app
21
+
22
+
23
  EXPOSE 8501
24
 
25
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
26
 
27
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
28
+ # ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  altair
2
  pandas
3
- streamlit
 
 
 
 
1
  altair
2
  pandas
3
+ streamlit
4
+ torch==2.6.0
5
+ transformers==4.52.4
6
+ jaxtyping
src/callbacks.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class EarlyStopping:
    """Stop training when validation loss stops improving.

    Tracks the best validation loss seen so far; once no improvement
    greater than ``min_delta`` has occurred for ``patience`` consecutive
    calls to :meth:`step`, the ``should_stop`` flag is set.
    """

    def __init__(self, patience=5, min_delta=1e-4, verbose=True):
        self.patience = patience      # epochs to tolerate without improvement
        self.min_delta = min_delta    # minimum decrease that counts as improvement
        self.verbose = verbose
        self.best_loss = float("inf")  # best (lowest) validation loss observed
        self.counter = 0               # consecutive epochs without improvement
        self.should_stop = False

    def step(self, val_loss):
        """Record one epoch's validation loss and update the stop flag."""
        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1

        if self.counter >= self.patience:
            self.should_stop = True
            if self.verbose:
                print(
                    f"[EarlyStopping] No improvement for {self.patience} epochs → stopping."
                )
src/config.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

import torch

# --- Path Configuration ---
# BUG FIX: paths were built from raw Windows-backslash strings such as
# Path(r"data\IWSLT-15-en-vi"). On the Linux Docker image the backslash is
# a literal filename character, so every path broke. pathlib "/" joining
# resolves correctly on both Windows and Linux and yields the same path
# on Windows as before.
DATA_PATH = Path("data") / "IWSLT-15-en-vi"

# TOKENIZER_NAME = "iwslt_en-vi_tokenizer_16k.json"
TOKENIZER_NAME = "iwslt_en-vi_tokenizer_32k.json"
TOKENIZER_PATH = Path("artifacts") / "tokenizers" / TOKENIZER_NAME

MODEL_DIR = Path("artifacts") / "models"

# MODEL_NAME = "transformer_en_vi_iwslt_1.pt"
MODEL_NAME = "transformer_en_vi_iwslt_1.safetensors"

# MODEL_SAVE_PATH = MODEL_DIR / MODEL_NAME
MODEL_SAVE_PATH = MODEL_DIR / "transformer_en_vi_iwslt_kaggle_1.safetensors"

CHECKPOINT_PATH = Path("artifacts") / "checkpoints" / MODEL_NAME

# HuggingFace cache directory; empty string means "use the library default".
CACHE_DIR = ""


# --- Hardware & Data Config ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NUM_WORKERS: int = 4

VOCAB_SIZE: int = 32_000

SPECIAL_TOKENS: list[str] = ["[PAD]", "[UNK]", "[SOS]", "[EOS]"]

# Training subset size; raise (e.g. to 1_000_000) for a full run.
NUM_SAMPLES_TO_USE: int = 1000
# NUM_SAMPLES_TO_USE: int = 1_000_000


# --- Tokenizer Constants ---
# IDs must match the ordering of SPECIAL_TOKENS above.
PAD_TOKEN_ID: int = 0
UNK_TOKEN_ID: int = 1
SOS_TOKEN_ID: int = 2
EOS_TOKEN_ID: int = 3


# --- Model Hyperparameters ---
# D_MODEL: int = 256
D_MODEL: int = 512  # model/embedding dimension
N_LAYERS: int = 6  # N=6 in "Attention Is All You Need"
N_HEADS: int = 8  # h=8 in paper
# D_FF: int = 1024
D_FF: int = 2048  # feed-forward inner dimension (4 * d_model)
DROPOUT: float = 0.1  # dropout = 0.1 in paper
MAX_SEQ_LEN: int = 150  # max length for positional encoding


# --- Training Configuration ---
# LEARNING_RATE: float = 1e-4
LEARNING_RATE: float = 5e-4
BATCH_SIZE: int = 32
EPOCHS: int = 5
# EPOCHS: int = 50

# --- HuggingFace Hub ---
REPO_ID: str = "AlainDeLong/transformer-en-vi-base"
FILENAME: str = "transformer_en_vi_iwslt_kaggle_1.safetensors"

if __name__ == "__main__":
    print(f"Using device: {DEVICE}")
src/dataset.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch import Tensor
4
+ from torch.utils.data import Dataset, DataLoader
5
+ from datasets import Dataset as ArrowDataset
6
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
7
+
8
+ import config
9
+ from src import utils
10
+
11
+
12
class TranslationDataset(Dataset):
    """Lazily tokenizes parallel sentence pairs for seq2seq training.

    Tokenization happens on demand in ``__getitem__`` (no precomputation),
    using the high-level PreTrainedTokenizerFast wrapper. Each item yields
    source token IDs (no special tokens) and target token IDs wrapped in
    [SOS] ... [EOS].
    """

    def __init__(
        self,
        dataset: "ArrowDataset",
        tokenizer: "PreTrainedTokenizerFast",
        max_len_src: int,
        max_len_tgt: int,
        src_lang: str = "en",
        tgt_lang: str = "vi",
    ):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len_src = max_len_src
        self.max_len_tgt = max_len_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, index: int) -> dict[str, list[int]]:
        pair = self.dataset[index]["translation"]
        src_text = pair[self.src_lang]
        tgt_text = pair[self.tgt_lang]

        # add_special_tokens=False: SOS/EOS are handled manually below.
        encoded_src = self.tokenizer(
            src_text,
            truncation=True,
            max_length=self.max_len_src,
            add_special_tokens=False,  # source carries no SOS/EOS
        )
        encoded_tgt = self.tokenizer(
            tgt_text,
            truncation=True,
            max_length=self.max_len_tgt - 2,  # reserve 2 slots for SOS/EOS
            add_special_tokens=False,
        )

        # Wrap the target in SOS ... EOS.
        target_ids = [config.SOS_TOKEN_ID]
        target_ids.extend(encoded_tgt["input_ids"])
        target_ids.append(config.EOS_TOKEN_ID)

        return {"src_ids": encoded_src["input_ids"], "tgt_ids": target_ids}
67
+
68
+
69
class DataCollator:
    """Custom ``collate_fn`` for the translation DataLoader.

    Given a list of ``{"src_ids", "tgt_ids"}`` examples (targets already
    carry SOS/EOS from the Dataset), it:
      * builds teacher-forcing decoder inputs ([SOS, w1, ...]) and labels
        ([w1, ..., EOS]) by shifting the target sequence,
      * dynamically pads every sequence to the longest in the batch,
      * builds the source padding mask and the combined target
        padding + look-ahead mask,
      * returns a single dict of tensors.
    """

    def __init__(self, pad_token_id: int):
        self.pad_token_id = pad_token_id

    def _pad(self, sequences: list[list[int]]) -> Tensor:
        # Dynamic padding to this batch's maximum length; (B, T) layout.
        return nn.utils.rnn.pad_sequence(
            [torch.tensor(seq) for seq in sequences],
            batch_first=True,
            padding_value=self.pad_token_id,
        )

    def __call__(self, batch: list[dict[str, list[int]]]) -> dict[str, Tensor]:
        sources = [example["src_ids"] for example in batch]
        targets = [example["tgt_ids"] for example in batch]  # already SOS...EOS

        # Teacher forcing: decoder sees [SOS, w1..wn], must predict [w1..wn, EOS].
        decoder_inputs = [seq[:-1] for seq in targets]
        labels = [seq[1:] for seq in targets]

        src_ids_padded = self._pad(sources)
        dec_input_ids_padded = self._pad(decoder_inputs)
        labels_padded = self._pad(labels)  # loss ignores pad_token_id

        tgt_len = dec_input_ids_padded.size(1)

        # Masks are built on CPU; moved to device with the batch later.
        # (Mask 1) Source padding mask for encoder MHA & cross-attention:
        # shape (B, 1, 1, T_src).
        src_mask = utils.create_padding_mask(src_ids_padded, self.pad_token_id)

        # (Mask 2) Target padding mask, shape (B, 1, 1, T_tgt), combined with
        # (Mask 3) the causal look-ahead mask, shape (1, 1, T_tgt, T_tgt),
        # giving (Mask 4) the decoder mask of shape (B, 1, T_tgt, T_tgt).
        tgt_padding_mask = utils.create_padding_mask(
            dec_input_ids_padded, self.pad_token_id
        )
        tgt_mask = tgt_padding_mask & utils.create_look_ahead_mask(tgt_len)

        return {
            "src_ids": src_ids_padded,          # (B, T_src)
            "tgt_input_ids": dec_input_ids_padded,  # (B, T_tgt)
            "labels": labels_padded,            # (B, T_tgt)
            "src_mask": src_mask,               # (B, 1, 1, T_src)
            "tgt_mask": tgt_mask,               # (B, 1, T_tgt, T_tgt)
        }
147
+
148
+
149
def get_translation_datasets(
    tokenizer: "PreTrainedTokenizerFast",
) -> "tuple[TranslationDataset, TranslationDataset, TranslationDataset]":
    """
    Factory that builds the train/val/test PyTorch datasets.

    Steps:
      1. Load and clean the raw data splits (via src.utils).
      2. Subsample the training split to at most NUM_SAMPLES_TO_USE rows.
      3. Wrap each split in a TranslationDataset.

    Args:
        tokenizer: The trained tokenizer.

    Returns:
        Tuple containing (train_ds, val_ds, test_ds).
    """
    # 1. Load raw cleaned data (keeps raw-data handling out of train.py).
    train_data, val_data, test_data = utils.get_raw_data(
        config.DATA_PATH, num_workers=config.NUM_WORKERS
    )

    # BUG FIX: clamp the subsample size so Dataset.select never receives an
    # out-of-range index when the split has fewer rows than the configured cap.
    num_samples = min(config.NUM_SAMPLES_TO_USE, len(train_data))
    train_data = train_data.select(range(num_samples))

    print("Building PyTorch Datasets...")

    # 2-4. Wrap each split; all share the same max sequence lengths.
    train_ds = TranslationDataset(
        dataset=train_data,
        tokenizer=tokenizer,
        max_len_src=config.MAX_SEQ_LEN,
        max_len_tgt=config.MAX_SEQ_LEN,
    )

    val_ds = TranslationDataset(
        dataset=val_data,
        tokenizer=tokenizer,
        max_len_src=config.MAX_SEQ_LEN,
        max_len_tgt=config.MAX_SEQ_LEN,
    )

    test_ds = TranslationDataset(
        dataset=test_data,
        tokenizer=tokenizer,
        max_len_src=config.MAX_SEQ_LEN,
        max_len_tgt=config.MAX_SEQ_LEN,
    )

    print(
        f"Datasets created: Train={len(train_ds)}, Val={len(val_ds)}, Test={len(test_ds)}"
    )

    return train_ds, val_ds, test_ds
206
+
207
+
208
def get_dataloaders(
    tokenizer: "PreTrainedTokenizerFast",
) -> tuple[DataLoader, DataLoader, DataLoader]:
    """
    High-level factory that creates the train/val/test DataLoaders.

    Abstracts the full data pipeline: loading/cleaning raw data, building
    PyTorch Datasets, instantiating the DataCollator (dynamic padding +
    attention masks), and wiring up the DataLoaders with the configured
    batch size and worker counts.

    Args:
        tokenizer: The trained tokenizer.

    Returns:
        Tuple containing (train_loader, val_loader, test_loader).
    """
    # 1. Build the three datasets.
    train_ds, val_ds, test_ds = get_translation_datasets(tokenizer)

    # 2. Shared collator (needs the PAD id to pad and to mark mask positions).
    collator = DataCollator(pad_token_id=config.PAD_TOKEN_ID)

    print(
        f"Building DataLoaders (Batch Size: {config.BATCH_SIZE}, Workers: {config.NUM_WORKERS})..."
    )

    # pin_memory only helps host->GPU transfers; decide once, explicitly on
    # the device *type* rather than comparing a torch.device to a string.
    pin_memory = config.DEVICE.type == "cuda"

    # 3. Train loader — shuffling is CRITICAL for training.
    train_loader = DataLoader(
        train_ds,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
        collate_fn=collator,
        pin_memory=pin_memory,
        prefetch_factor=2,
        persistent_workers=True,
    )

    # 4. Validation loader — no shuffle (reproducible), larger batches
    # since no backward pass is run.
    val_loader = DataLoader(
        val_ds,
        batch_size=2 * config.BATCH_SIZE,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        collate_fn=collator,
        pin_memory=pin_memory,
        prefetch_factor=2,
        persistent_workers=True,
    )

    # 5. Test loader — deliberately fewer workers than training.
    test_loader = DataLoader(
        test_ds,
        batch_size=2 * config.BATCH_SIZE,
        shuffle=False,
        num_workers=2,
        collate_fn=collator,
        pin_memory=pin_memory,
        prefetch_factor=2,
    )

    print(f"DataLoader (train) created with {len(train_loader)} batches.")
    print(f"DataLoader (val) created with {len(val_loader)} batches.")
    print(f"DataLoader (test) created with {len(test_loader)} batches.")

    return train_loader, val_loader, test_loader
src/embedding.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import Tensor
3
+ import torch.nn as nn
4
+ from jaxtyping import Int, Float
5
+ import math
6
+
7
+
8
class InputEmbeddings(nn.Module):
    """Token-ID lookup embeddings scaled by sqrt(d_model).

    Converts a (B, T) tensor of token IDs into (B, T, D) embedding
    vectors, multiplied by sqrt(d_model) as prescribed in
    "Attention Is All You Need", Section 3.4.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        """
        Args:
            d_model: Dimension of each embedding vector (D).
            vocab_size: Number of entries in the vocabulary.
        """
        super().__init__()

        self.d_model: int = d_model
        self.vocab_size: int = vocab_size
        self.token_emb: nn.Embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x: "Int[Tensor, 'B T']") -> "Float[Tensor, 'B T D']":
        """Map token IDs (B, T) to scaled embeddings (B, T, D).

        Args:
            x: Integer tensor of token IDs, shape (B, T).

        Returns:
            Embedding vectors scaled by sqrt(d_model), shape (B, T, D).
        """
        scale = math.sqrt(self.d_model)
        # (B, T) -> (B, T, D), then apply the paper's sqrt(d_model) scaling.
        return self.token_emb(x) * scale
48
+
49
+
50
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding with dropout.

    (Ref: "Attention Is All You Need", Section 3.5.) A (1, T_max, D)
    table of sin/cos values is pre-computed once at construction;
    ``forward`` adds the first T rows to the input embeddings and then
    applies dropout to the sum.
    """

    def __init__(self, d_model: int, max_seq_len: int, dropout: float = 0.1) -> None:
        """
        Args:
            d_model: The dimension of the model (D).
            max_seq_len: Maximum sequence length (T_max) to pre-compute.
            dropout: Dropout probability applied after adding the encoding.
        """
        super().__init__()

        self.dropout: nn.Dropout = nn.Dropout(p=dropout)

        # positions: (T_max, 1) column vector of 0..T_max-1
        positions = torch.arange(max_seq_len).unsqueeze(1).float()

        # Geometric frequency terms 1/10000^(2i/D), one per even dimension.
        freqs = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000) / d_model)
        )

        table = torch.zeros(max_seq_len, d_model)  # (T_max, D)
        table[:, 0::2] = torch.sin(positions * freqs)  # even dims -> sin
        table[:, 1::2] = torch.cos(positions * freqs)  # odd dims -> cos

        # Store as a (1, T_max, D) buffer: moves with the module across
        # devices but is not a trainable parameter.
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, x: "Float[Tensor, 'B T D']") -> "Float[Tensor, 'B T D']":
        """Add positional information to (B, T, D) embeddings, then dropout.

        Args:
            x: Token embeddings (already scaled), shape (B, T, D).

        Returns:
            Tensor with positional information and dropout, shape (B, T, D).
        """
        return self.dropout(x + self.pe[:, : x.size(1), :])
src/engine.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import DataLoader
4
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
5
+ from torchmetrics.text import BLEUScore, SacreBLEUScore
6
+ from tqdm.auto import tqdm
7
+ import config
8
+ from src import model, utils
9
+
10
+
11
# Class count used when flattening logits for CrossEntropyLoss; mirrors config.VOCAB_SIZE.
TGT_VOCAB_SIZE: int = config.VOCAB_SIZE
12
+
13
+
14
def train_one_epoch(
    model: model.Transformer,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    scheduler: torch.optim.lr_scheduler.LambdaLR,
    device: torch.device,
    logger=None,
) -> float:
    """
    Runs a single training epoch.

    Args:
        model: The Transformer model.
        dataloader: The training DataLoader.
        optimizer: The optimizer.
        criterion: The loss function (e.g., CrossEntropyLoss).
        scheduler: Per-batch LR scheduler; stepped once after every optimizer step.
        device: The device to run on (e.g., 'cuda').
        logger: Optional metrics logger exposing a ``log(dict)`` method
            (e.g. a wandb run); batch loss and current LR are logged
            every 100 batches when provided.

    Returns:
        The average training loss for the epoch.
    """

    # Set model to training mode (enables dropout, etc.).
    model.train()

    total_loss = 0.0

    # Use tqdm for a progress bar.
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    batch_idx: int = 0

    for batch in progress_bar:
        batch_idx += 1

        # 1. Move tensor entries of the batch to the device (non-tensors dropped).
        batch_gpu = {
            k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)
        }

        # 2. Zero gradients before the forward pass.
        optimizer.zero_grad()

        # 3. Forward pass (inputs as defined in Transformer.forward).
        logits = model(
            src=batch_gpu["src_ids"],
            tgt=batch_gpu["tgt_input_ids"],
            src_mask=batch_gpu["src_mask"],
            tgt_mask=batch_gpu["tgt_mask"],
        )  # Shape: (B, T_tgt, vocab_size)

        # 4. Calculate loss.
        # CrossEntropyLoss expects (N, C) and (N,), so flatten:
        # Logits: (B, T_tgt, C) -> (B * T_tgt, C)
        # Labels: (B, T_tgt) -> (B * T_tgt)
        loss = criterion(logits.view(-1, TGT_VOCAB_SIZE), batch_gpu["labels"].view(-1))

        # 5. Backward pass (compute gradients).
        loss.backward()

        # 6. Gradient clipping at max_norm 1.0 to prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # 7. Update weights.
        optimizer.step()

        # 8. Step the learning-rate scheduler once per batch.
        scheduler.step()

        # 9. Update stats.
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

        # 10. Log metrics every 100 batches, if a logger was provided.
        if logger and batch_idx % 100 == 0:
            logger.log(
                {
                    "train/batch_loss": loss.item(),
                    "train/learning_rate": optimizer.param_groups[0]["lr"],
                }
            )

    # Return average loss for the epoch.
    return total_loss / len(dataloader)
103
+
104
+
105
def validate_one_epoch(
    model: model.Transformer,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device,
) -> float:
    """
    Runs a single validation epoch (no gradient computation or updates).

    Args:
        model: The Transformer model.
        dataloader: The validation DataLoader.
        criterion: The loss function (e.g., CrossEntropyLoss).
        device: The device to run on (e.g., 'cuda').

    Returns:
        The average validation loss for the epoch.
    """

    # Evaluation mode disables dropout.
    model.eval()

    running_loss = 0.0
    progress_bar = tqdm(dataloader, desc="Validating", leave=False)

    # no_grad saves VRAM and speeds up the pass.
    with torch.no_grad():
        for batch in progress_bar:
            # Move tensor entries to the device; drop anything else.
            batch_gpu = {
                key: value.to(device)
                for key, value in batch.items()
                if isinstance(value, torch.Tensor)
            }

            # Forward pass -> (B, T_tgt, vocab_size)
            logits = model(
                src=batch_gpu["src_ids"],
                tgt=batch_gpu["tgt_input_ids"],
                src_mask=batch_gpu["src_mask"],
                tgt_mask=batch_gpu["tgt_mask"],
            )

            # Flatten to (B*T, C) vs (B*T,), matching the training loop.
            batch_loss = criterion(
                logits.view(-1, TGT_VOCAB_SIZE), batch_gpu["labels"].view(-1)
            )

            running_loss += batch_loss.item()
            progress_bar.set_postfix(loss=batch_loss.item())

    return running_loss / len(dataloader)
162
+
163
+
164
def evaluate_model(
    model: model.Transformer,
    dataloader: DataLoader,
    tokenizer: PreTrainedTokenizerFast,
    device: torch.device,
    table=None,
) -> tuple[float, float]:
    """
    Runs final evaluation using greedy decoding and computes corpus-level
    BLEU and SacreBLEU scores.

    (Docstring fix: the original said "Beam Search", but the code calls
    utils.greedy_decode_sentence.)

    Args:
        model: The Transformer model.
        dataloader: The evaluation DataLoader.
        tokenizer: Tokenizer used to map IDs back to token strings.
        device: The device to run on.
        table: Optional wandb-style table with an ``add_data(exp, pred)``
            method; up to 5 example translations are appended when provided.

    Returns:
        Tuple of (BLEU * 100, SacreBLEU * 100).
    """
    print("\n--- Starting Evaluation (BLEU + SacreBLEU) ---")

    # Evaluation mode disables dropout.
    model.eval()

    all_predicted_strings = []
    all_expected_strings = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):

            batch_gpu = {
                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)
            }

            src_ids = batch_gpu["src_ids"]
            src_mask = batch_gpu["src_mask"]
            expected_ids = batch_gpu["labels"]  # (B, T_tgt) on device

            B = src_ids.size(0)

            # --- Reference strings: 2D device tensor -> CPU lists -> text ---
            batch_expected_strings = []
            expected_id_lists = expected_ids.cpu().tolist()
            for id_list in expected_id_lists:
                token_list = tokenizer.convert_ids_to_tokens(id_list)
                batch_expected_strings.append(
                    utils.filter_and_detokenize(token_list, skip_special=True)
                )

            # --- Greedy-decode one sentence at a time ---
            batch_predicted_strings = []
            for i in tqdm(range(B), desc="Decoding Batch", leave=False):
                src_sentence = src_ids[i].unsqueeze(0)
                src_sentence_mask = src_mask[i].unsqueeze(0)

                # predicted_ids: 1D tensor (T_out) on the device
                predicted_ids = utils.greedy_decode_sentence(
                    model,
                    src_sentence,
                    src_sentence_mask,
                    max_len=config.MAX_SEQ_LEN,
                    sos_token_id=config.SOS_TOKEN_ID,
                    eos_token_id=config.EOS_TOKEN_ID,
                    device=device,
                )

                predicted_id_list = predicted_ids.cpu().tolist()
                predicted_token_list = tokenizer.convert_ids_to_tokens(
                    predicted_id_list
                )
                decoded_str = utils.filter_and_detokenize(
                    predicted_token_list, skip_special=True
                )
                batch_predicted_strings.append(decoded_str)

            # --- Accumulate for corpus-level metric calculation ---
            all_predicted_strings.extend(batch_predicted_strings)
            # torchmetrics expects one list of references per prediction.
            all_expected_strings.extend([[s] for s in batch_expected_strings])

    bleu_metric = BLEUScore(n_gram=4, smooth=True).to(config.DEVICE)
    sacrebleu_metric = SacreBLEUScore(
        n_gram=4, smooth=True, tokenize="intl", lowercase=False
    ).to(config.DEVICE)

    # --- Calculate final scores ---
    print("\nCalculating final BLEU score...")
    final_bleu = bleu_metric(all_predicted_strings, all_expected_strings)

    print("\nCalculating final SacreBLEU score...")
    final_sacrebleu = sacrebleu_metric(all_predicted_strings, all_expected_strings)

    # --- Show some examples ---
    print("\n--- Translation Examples (Pred vs Exp) ---")
    for i in range(min(5, len(all_predicted_strings))):
        print(f" PRED: {all_predicted_strings[i]}")
        print(f" EXP: {all_expected_strings[i][0]}")
        print(" ---")

        # BUG FIX: the original called table.add_data unconditionally,
        # raising AttributeError whenever table was left as its default None.
        if table is not None:
            table.add_data(all_expected_strings[i][0], all_predicted_strings[i])

    return final_bleu.item() * 100, final_sacrebleu.item() * 100
src/layers.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import Tensor
2
+ import torch.nn as nn
3
+ from jaxtyping import Bool, Float
4
+ import math
5
+
6
+
7
+ class MultiHeadAttention(nn.Module):
8
+ """
9
+ Terminology (jaxtyping):
10
+ B: batch_size
11
+ T_q: target sequence length (query)
12
+ T_k: source sequence length (key/value)
13
+ D: d_model (model dimension)
14
+ H: n_heads (number of heads)
15
+ d_k: dimension of each head (d_model / n_heads)
16
+ """
17
+
18
+ def __init__(self, d_model: int, n_heads: int) -> None:
19
+ super().__init__()
20
+ assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
21
+
22
+ self.d_model: int = d_model
23
+ self.n_heads: int = n_heads
24
+ self.d_k: int = d_model // n_heads
25
+
26
+ self.w_q: nn.Linear = nn.Linear(d_model, d_model, bias=False)
27
+ self.w_k: nn.Linear = nn.Linear(d_model, d_model, bias=False)
28
+ self.w_v: nn.Linear = nn.Linear(d_model, d_model, bias=False)
29
+ self.w_o: nn.Linear = nn.Linear(d_model, d_model, bias=False)
30
+
31
+ self.attention_weights: Tensor | None = None
32
+
33
+ @staticmethod
34
+ def attention(
35
+ query: Float[Tensor, "B H T_q d_k"],
36
+ key: Float[Tensor, "B H T_k d_k"],
37
+ value: Float[Tensor, "B H T_k d_k"],
38
+ mask: Bool[Tensor, "... 1 T_q T_k"] | None,
39
+ ) -> tuple[Float[Tensor, "B H T_q d_k"], Float[Tensor, "B H T_q T_k"]]:
40
+ """
41
+ Static method for Scaled Dot-Product Attention calculation.
42
+ This is pure, stateless logic, making it easy to test.
43
+ (Ref: "Attention Is All You Need", Equation 1)
44
+
45
+ Args:
46
+ query (Tensor): Query tensor
47
+ key (Tensor): Key tensor
48
+ value (Tensor): Value tensor
49
+ mask (Tensor | None): Optional mask (for padding or look-ahead).
50
+
51
+ Returns:
52
+ tuple[Tensor, Tensor]:
53
+ - context_vector: The output of the attention mechanism.
54
+ - attention_weights: The softmax-normalized attention weights.
55
+ """
56
+
57
+ d_k: int = query.shape[-1]
58
+
59
+ # (B, H, T_q, d_k) @ (B, H, d_k, T_k) -> (B, H, T_q, T_k)
60
+ attention_scores: Tensor = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
61
+
62
+ if mask is not None:
63
+ attention_scores = attention_scores.masked_fill(
64
+ mask == 0, value=float("-inf")
65
+ )
66
+
67
+ attention_weights: Tensor = attention_scores.softmax(dim=-1)
68
+
69
+ # (B, H, T_q, T_k) @ (B, H, T_k, d_k) -> (B, H, T_q, d_k)
70
+ context_vector: Tensor = attention_weights @ value
71
+
72
+ return context_vector, attention_weights
73
+
74
+ def forward(
75
+ self,
76
+ q: Float[Tensor, "B T_q D"],
77
+ k: Float[Tensor, "B T_k D"],
78
+ v: Float[Tensor, "B T_k D"],
79
+ mask: Bool[Tensor, "... 1 T_q T_k"] | None = None, # Optional mask
80
+ ) -> Float[Tensor, "B T_q D"]:
81
+ """
82
+ Forward pass for Multi-Head Attention.
83
+
84
+ In Self-Attention (Encoder), q, k, and v are all the same tensor.
85
+ In Cross-Attention (Decoder), q comes from the Decoder, while k and v
86
+ come from the Encoder's output.
87
+
88
+ Args:
89
+ q: Query tensor
90
+ k: Key tensor
91
+ v: Value tensor
92
+ mask: Optional mask to apply (padding or look-ahead)
93
+
94
+ Returns:
95
+ The context vector after multi-head attention and output projection.
96
+ """
97
+
98
+ B, T_q, _ = q.shape
99
+ _, T_k, _ = k.shape # T_k == T_v
100
+
101
+ # (B, T, D) -> (B, T, D)
102
+ Q: Tensor = self.w_q(q)
103
+ K: Tensor = self.w_k(k)
104
+ V: Tensor = self.w_v(v)
105
+
106
+ # (B, T, D) -> (B, T, H, d_k) -> (B, H, T, d_k)
107
+ Q = Q.view(B, T_q, self.n_heads, self.d_k).transpose(1, 2)
108
+ K = K.view(B, T_k, self.n_heads, self.d_k).transpose(1, 2)
109
+ V = V.view(B, T_k, self.n_heads, self.d_k).transpose(1, 2)
110
+
111
+ context_vector, self.attention_weights = self.attention(Q, K, V, mask)
112
+
113
+ # (B, H, T_q, d_k) -> (B, T_q, H, d_k)
114
+ context_vector = context_vector.transpose(1, 2).contiguous()
115
+
116
+ # (B, T_q, H, d_k) -> (B, T_q, D)
117
+ context_vector = context_vector.view(B, T_q, self.d_model)
118
+
119
+ # (B, T_q, D) -> (B, T_q, D)
120
+ output: Tensor = self.w_o(context_vector)
121
+
122
+ return output
123
+
124
+
125
class PositionwiseFeedForward(nn.Module):
    """
    Position-wise feed-forward sublayer:

        FFN(x) = ReLU(x @ W1 + b1) @ W2 + b2

    A two-layer MLP applied independently at every sequence position.
    (Ref: "Attention Is All You Need", Section 3.3.)

    Shape glossary:
        B: batch size, T: sequence length,
        D: d_model, D_FF: inner feed-forward dimension.
    """

    def __init__(self, d_model: int, d_ff: int) -> None:
        """
        Args:
            d_model (int): Model dimension (e.g. 512).
            d_ff (int): Inner FFN dimension (e.g. 2048); the paper uses
                d_ff = 4 * d_model. Dropout is handled by the surrounding
                residual wrapper, not here.
        """
        super().__init__()

        # Expansion -> non-linearity -> contraction, created in this order
        # so parameter initialisation matches the reference implementation.
        self.linear_1: nn.Linear = nn.Linear(d_model, d_ff)  # (B,T,D) -> (B,T,D_FF)
        self.activation: nn.ReLU = nn.ReLU()
        self.linear_2: nn.Linear = nn.Linear(d_ff, d_model)  # (B,T,D_FF) -> (B,T,D)

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply the two linear maps with a ReLU in between.

        Args:
            x: Input of shape (B, T, D) from the previous sublayer.

        Returns:
            Tensor of the same shape (B, T, D).
        """
        return self.linear_2(self.activation(self.linear_1(x)))
src/model.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

import torch
import torch.nn as nn
from torch import Tensor

from huggingface_hub import hf_hub_download
from jaxtyping import Bool, Int, Float
from safetensors.torch import load_model

import config
from embedding import InputEmbeddings, PositionalEncoding
from modules import Encoder, Decoder
10
+
11
+
12
class Generator(nn.Module):
    """
    Final projection from decoder hidden states to vocabulary logits.

    Maps the decoder output (B, T, D) onto the vocabulary space
    (B, T, vocab_size). Softmax is applied by the loss / sampling code,
    not here. The projection weight can be tied with the target embedding
    (handled by the enclosing Transformer class).
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        """
        Args:
            d_model (int): Model dimension (D).
            vocab_size (int): Target vocabulary size.
        """
        super().__init__()
        # bias=False keeps the layer compatible with embedding weight tying.
        self.proj: nn.Linear = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """
        Project decoder states to logits.

        Args:
            x: Final decoder output, shape (B, T_tgt, D).

        Returns:
            Logits over the vocabulary, shape (B, T_tgt, vocab_size).
        """
        # (B, T_tgt, D) -> (B, T_tgt, vocab_size)
        return self.proj(x)
52
+
53
+
54
class Transformer(nn.Module):
    """
    The main Transformer model architecture, combining the Encoder
    and Decoder stacks, as described in "Attention Is All You Need".

    This implementation follows modern best practices (Pre-LN) and
    is designed for a sequence-to-sequence task (e.g., translation).
    """

    def __init__(
        self,
        src_vocab_size: int,
        tgt_vocab_size: int,
        d_model: int,
        n_heads: int,
        n_layers: int,  # N=6 in the paper
        d_ff: int,
        dropout: float = 0.1,
        max_seq_len: int = 512,  # Max length for positional encoding
    ) -> None:
        """
        Initializes the full Transformer model.

        Args:
            src_vocab_size (int): Vocabulary size for the source language.
            tgt_vocab_size (int): Vocabulary size for the target language.
            d_model (int): The dimension of the model (D).
            n_heads (int): The number of attention heads (H).
            n_layers (int): The number of Encoder/Decoder layers (N).
            d_ff (int): The inner dimension of the Feed-Forward Network (D_FF).
            dropout (float): The dropout rate.
            max_seq_len (int): The maximum sequence length for positional encoding.
        """
        super().__init__()

        self.d_model = d_model

        # --- 1. Source (Encoder) Embeddings ---
        # Separate embedding tables for source and target vocabularies.
        self.src_embed: InputEmbeddings = InputEmbeddings(d_model, src_vocab_size)

        # --- 2. Target (Decoder) Embeddings ---
        self.tgt_embed: InputEmbeddings = InputEmbeddings(d_model, tgt_vocab_size)

        # --- 3. Positional Encoding ---
        # A single PositionalEncoding module shared by source and target
        # (the encoding depends only on position, not on the language).
        self.pos_enc: PositionalEncoding = PositionalEncoding(
            d_model, max_seq_len, dropout
        )

        # --- 4. Encoder Stack ---
        self.encoder: Encoder = Encoder(d_model, n_heads, d_ff, n_layers, dropout)

        # --- 5. Decoder Stack ---
        self.decoder: Decoder = Decoder(d_model, n_heads, d_ff, n_layers, dropout)

        # --- 6. Final Output Projection (Generator) ---
        self.generator: Generator = Generator(d_model, tgt_vocab_size)

        # --- Weight Tying ---
        # Share one weight tensor between the target embedding and the output
        # projection; saves parameters and tends to improve performance.
        self.generator.proj.weight = self.tgt_embed.token_emb.weight

        # --- Initialize weights ---
        # NOTE(review): apply() runs *after* tying, so the shared tensor is
        # re-initialized by whichever module apply() visits last (Linear
        # xavier vs. Embedding normal) — confirm this matches training-time
        # initialization.
        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module):
        """
        Applies Xavier/Glorot uniform initialization to linear layers.
        This is a common and effective initialization strategy.
        """
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)

            # Linear layers created with bias=False have module.bias is None.
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

        elif isinstance(module, nn.Embedding):
            # Initialize embeddings from a normal distribution scaled by
            # 1/sqrt(d_model), the usual Transformer embedding scale.
            nn.init.normal_(module.weight, mean=0, std=self.d_model**-0.5)

    def forward(
        self,
        src: Int[Tensor, "B T_src"],  # Source token IDs (e.g., English)
        tgt: Int[Tensor, "B T_tgt"],  # Target token IDs (e.g., Vietnamese)
        src_mask: Bool[Tensor, "B 1 1 T_src"],  # Source padding mask
        tgt_mask: Bool[Tensor, "B 1 T_tgt T_tgt"],  # Target combined mask
    ) -> Float[Tensor, "B T_tgt vocab_size"]:
        """
        Defines the main forward pass of the Transformer model.

        Args:
            src (Tensor): Source sequence token IDs.
            tgt (Tensor): Target sequence token IDs (shifted right).
            src_mask (Tensor): Padding mask for the source sequence.
            tgt_mask (Tensor): Combined padding and look-ahead mask
                               for the target sequence.

        Returns:
            Tensor: The output logits from the model (B, T_tgt, vocab_size).
        """
        # 1. Encode the source sequence
        # (B, T_src) -> (B, T_src, D)
        src_embeded = self.src_embed(src)
        src_with_pos = self.pos_enc(src_embeded)

        # (B, T_src, D) -> (B, T_src, D)
        # This 'memory' is consumed by every DecoderLayer's cross-attention.
        enc_output: Tensor = self.encoder(src_with_pos, src_mask)

        # 2. Decode the target sequence
        # (B, T_tgt) -> (B, T_tgt, D)
        tgt_embeded = self.tgt_embed(tgt)
        tgt_with_pos = self.pos_enc(tgt_embeded)

        # (B, T_tgt, D) -> (B, T_tgt, D)
        dec_output: Tensor = self.decoder(tgt_with_pos, enc_output, src_mask, tgt_mask)

        # 3. Generate final logits
        # (B, T_tgt, D) -> (B, T_tgt, vocab_size)
        logits: Tensor = self.generator(dec_output)

        return logits
180
+
181
+
182
def load_trained_model(
    config_obj, checkpoint_path, device: torch.device
) -> Transformer:
    """
    Build a Transformer from the hyper-parameters in `config_obj` and load
    trained safetensors weights into it.

    Weights come from `checkpoint_path` when that file exists locally;
    otherwise the checkpoint is downloaded from the Hugging Face Hub using
    `config_obj.REPO_ID` / `config_obj.FILENAME`. (Previously the
    `checkpoint_path` argument was accepted but silently ignored, and the
    Hub coordinates were read from the module-global `config` instead of
    the `config_obj` parameter.)

    Args:
        config_obj: Config namespace exposing VOCAB_SIZE, D_MODEL, N_HEADS,
            N_LAYERS, D_FF, DROPOUT, MAX_SEQ_LEN, REPO_ID and FILENAME.
        checkpoint_path: Optional local path to a .safetensors checkpoint;
            may be None or a non-existent path, in which case the Hub is used.
        device (torch.device): Device to place the model on.

    Returns:
        Transformer: Model with trained weights loaded (still in train mode;
        callers should switch to .eval() for inference).
    """
    # --- 1. Resolve the checkpoint: local file first, Hub as fallback ---
    if checkpoint_path and Path(checkpoint_path).is_file():
        model_path = str(checkpoint_path)
        print(f"Using local checkpoint: {model_path}")
    else:
        print("Downloading safetensors from Hub...")
        model_path = hf_hub_download(
            repo_id=config_obj.REPO_ID, filename=config_obj.FILENAME
        )

    # --- 2. Instantiate the architecture from the config ---
    print("Instantiating the Transformer model...")
    model = Transformer(
        src_vocab_size=config_obj.VOCAB_SIZE,
        tgt_vocab_size=config_obj.VOCAB_SIZE,
        d_model=config_obj.D_MODEL,
        n_heads=config_obj.N_HEADS,
        n_layers=config_obj.N_LAYERS,
        d_ff=config_obj.D_FF,
        dropout=config_obj.DROPOUT,
        max_seq_len=config_obj.MAX_SEQ_LEN,
    ).to(device)

    # --- 3. Load the trained weights in place ---
    print(f"Loading model from: {model_path}")
    load_model(model, filename=model_path)

    print(f"Successfully loaded trained weights from {model_path}")
    return model
src/modules.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import Tensor
2
+ import torch.nn as nn
3
+ from typing import Callable
4
+ from jaxtyping import Bool, Float
5
+ from layers import MultiHeadAttention, PositionwiseFeedForward
6
+
7
+
8
class ResidualConnection(nn.Module):
    """
    Pre-LN residual wrapper around a sublayer (attention or FFN):

        output = x + Dropout(sublayer(LayerNorm(x)))

    Normalising *before* the sublayer (as in GPT-2) trains more stably
    than the Post-LN arrangement of the original
    "Attention Is All You Need" paper.
    """

    def __init__(self, d_model: int, dropout: float = 0.1) -> None:
        """
        Args:
            d_model (int): Model dimension (D), used by the LayerNorm.
            dropout (float): Dropout probability on the sublayer output.
        """
        super().__init__()
        self.dropout: nn.Dropout = nn.Dropout(dropout)
        self.norm: nn.LayerNorm = nn.LayerNorm(d_model)

    def forward(self, x: Tensor, sublayer: Callable[[Tensor], Tensor]) -> Tensor:
        """
        Apply `sublayer` to the normalised input and add the (dropped-out)
        result back onto the raw input.

        Args:
            x: Input of shape (B, T, D) from the previous layer.
            sublayer: Callable (e.g. MHA or FFN) taking and returning
                a (B, T, D) tensor.

        Returns:
            Tensor of shape (B, T, D).
        """
        return x + self.dropout(sublayer(self.norm(x)))
57
+
58
+
59
class EncoderLayer(nn.Module):
    """
    One encoder block of the Transformer.

    Two sublayers, each inside a Pre-LN residual wrapper
    (LayerNorm -> sublayer -> Dropout -> add):
        1. multi-head self-attention,
        2. position-wise feed-forward network.
    """

    def __init__(
        self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1
    ) -> None:
        """
        Args:
            d_model (int): Model dimension (D).
            n_heads (int): Number of attention heads (H).
            d_ff (int): Inner FFN dimension (D_FF).
            dropout (float): Dropout rate inside the residual wrappers.
        """
        super().__init__()

        # Submodules are created in the same order as the reference
        # implementation so parameter initialisation is reproducible.
        self.self_attn: MultiHeadAttention = MultiHeadAttention(d_model, n_heads)
        self.feed_forward: PositionwiseFeedForward = PositionwiseFeedForward(
            d_model, d_ff
        )
        self.residual_1: ResidualConnection = ResidualConnection(d_model, dropout)
        self.residual_2: ResidualConnection = ResidualConnection(d_model, dropout)

    def forward(self, x: Tensor, src_mask: Tensor) -> Tensor:
        """
        Run one encoder block.

        Args:
            x: Activations from the previous layer or embedding stage, (B, T, D).
            src_mask: Source padding mask of shape (B, 1, 1, T_k); broadcasts
                over heads and query positions.

        Returns:
            Tensor of shape (B, T, D).
        """

        def apply_self_attention(normed: Tensor) -> Tensor:
            # Self-attention: queries, keys and values all come from `normed`.
            return self.self_attn(q=normed, k=normed, v=normed, mask=src_mask)

        x = self.residual_1(x, apply_self_attention)
        x = self.residual_2(x, self.feed_forward)
        return x
123
+
124
+
125
class Encoder(nn.Module):
    """
    The full Transformer encoder: a stack of N identical EncoderLayers.

    Takes embeddings + positional encodings and applies N rounds of
    self-attention and FFN. Because the layers use Pre-LN, one final
    LayerNorm is applied at the end of the stack before the output is
    handed to the decoder.
    """

    def __init__(
        self, d_model: int, n_heads: int, d_ff: int, n_layers: int, dropout: float = 0.1
    ) -> None:
        """
        Args:
            d_model (int): Model dimension (D).
            n_heads (int): Number of attention heads (H).
            d_ff (int): Inner FFN dimension (D_FF).
            n_layers (int): Number of stacked EncoderLayer blocks (N).
            dropout (float): Dropout rate inside the residual wrappers.
        """
        super().__init__()

        stacked = [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers: nn.ModuleList = nn.ModuleList(stacked)

        # Final norm of the Pre-LN convention.
        self.norm: nn.LayerNorm = nn.LayerNorm(d_model)

    def forward(self, x: Tensor, src_mask: Tensor) -> Tensor:
        """
        Run the whole encoder stack.

        Args:
            x: Token embeddings + positional encodings, (B, T, D).
            src_mask: Source padding mask, (B, 1, 1, T).

        Returns:
            The encoder "memory" for the decoder, shape (B, T, D).
        """
        for block in self.layers:
            x = block(x, src_mask)
        return self.norm(x)
179
+
180
+
181
class DecoderLayer(nn.Module):
    """
    One decoder block of the Transformer.

    Three sublayers, each inside a Pre-LN residual wrapper:
        1. masked multi-head self-attention over the target sequence,
        2. multi-head cross-attention over the encoder output,
        3. position-wise feed-forward network.
    """

    def __init__(
        self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1
    ) -> None:
        """
        Args:
            d_model (int): Model dimension (D).
            n_heads (int): Number of attention heads (H).
            d_ff (int): Inner FFN dimension (D_FF).
            dropout (float): Dropout rate inside the residual wrappers.
        """
        super().__init__()

        # Creation order matches the reference implementation for
        # reproducible parameter initialisation.
        self.self_attn: MultiHeadAttention = MultiHeadAttention(d_model, n_heads)
        self.cross_attn: MultiHeadAttention = MultiHeadAttention(d_model, n_heads)
        self.feed_forward: PositionwiseFeedForward = PositionwiseFeedForward(
            d_model, d_ff
        )
        self.residual_1: ResidualConnection = ResidualConnection(d_model, dropout)
        self.residual_2: ResidualConnection = ResidualConnection(d_model, dropout)
        self.residual_3: ResidualConnection = ResidualConnection(d_model, dropout)

    def forward(
        self,
        x: Tensor,
        enc_output: Tensor,
        src_mask: Tensor,
        tgt_mask: Tensor,
    ) -> Tensor:
        """
        Run one decoder block.

        Args:
            x: Input from the previous decoder layer, (B, T_tgt, D).
            enc_output: Encoder output used as K and V, (B, T_src, D).
            src_mask: Padding mask for the source (encoder) input.
            tgt_mask: Combined look-ahead + padding mask for the target.

        Returns:
            Tensor of shape (B, T_tgt, D).
        """

        def masked_self_attention(normed: Tensor) -> Tensor:
            # Target self-attention; tgt_mask prevents peeking at future tokens.
            return self.self_attn(q=normed, k=normed, v=normed, mask=tgt_mask)

        def encoder_cross_attention(normed: Tensor) -> Tensor:
            # Queries come from the decoder; keys/values from the encoder.
            return self.cross_attn(q=normed, k=enc_output, v=enc_output, mask=src_mask)

        x = self.residual_1(x, masked_self_attention)
        x = self.residual_2(x, encoder_cross_attention)
        x = self.residual_3(x, self.feed_forward)
        return x
261
+
262
+
263
class Decoder(nn.Module):
    """
    The full Transformer decoder: a stack of N identical DecoderLayers.

    Takes target embeddings + positional encodings and applies N rounds
    of masked self-attention, cross-attention and FFN. Because the layers
    use Pre-LN, one final LayerNorm is applied at the end of the stack
    before the output goes to the Generator projection.
    """

    def __init__(
        self, d_model: int, n_heads: int, d_ff: int, n_layers: int, dropout: float = 0.1
    ) -> None:
        """
        Args:
            d_model (int): Model dimension (D).
            n_heads (int): Number of attention heads (H).
            d_ff (int): Inner FFN dimension (D_FF).
            n_layers (int): Number of stacked DecoderLayer blocks (N).
            dropout (float): Dropout rate inside the residual wrappers.
        """
        super().__init__()

        stacked = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers: nn.ModuleList = nn.ModuleList(stacked)

        # Final norm of the Pre-LN convention.
        self.norm: nn.LayerNorm = nn.LayerNorm(d_model)

    def forward(
        self,
        x: Tensor,
        enc_output: Tensor,
        src_mask: Tensor,
        tgt_mask: Tensor,
    ) -> Tensor:
        """
        Run the whole decoder stack.

        Args:
            x: Target embeddings + positional encodings, (B, T_tgt, D).
            enc_output: Encoder memory (K, V for cross-attention), (B, T_src, D).
            src_mask: Padding mask for the source sequence.
            tgt_mask: Combined mask for the target sequence.

        Returns:
            Tensor of shape (B, T_tgt, D), ready for the Generator.
        """
        for block in self.layers:
            x = block(x, enc_output, src_mask, tgt_mask)
        return self.norm(x)
src/streamlit_app.py CHANGED
@@ -1,40 +1,178 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import time
3
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
4
+ from huggingface_hub import hf_hub_download
5
+ import config
6
+ import model
7
+ import utils
8
 
9
+
10
+ # ==========================================
11
+ # 1. ASSUMPTIONS
12
+ # ==========================================
13
+
14
+
15
@st.cache_resource
def load_artifacts():
    """
    Load the tokenizer and trained Transformer once per Streamlit process.

    Downloads the tokenizer JSON from the Hub, then builds and loads the
    trained model. On any failure, prints a warning and returns partially
    (or fully) None results so the UI can fall back to demo output instead
    of crashing at startup.

    Returns:
        tuple: (transformer_model, tokenizer); either element may be None
        if loading failed.
    """
    tokenizer: PreTrainedTokenizerFast | None = None
    transformer_model: model.Transformer | None = None

    try:
        tok_path = hf_hub_download(
            repo_id=config.REPO_ID, filename="iwslt_en-vi_tokenizer_32k.json"
        )
        tokenizer = utils.load_tokenizer(tok_path)

        print("Loading model for inference...")
        transformer_model = model.load_trained_model(
            config, config.MODEL_SAVE_PATH, config.DEVICE
        )

    except Exception as e:
        # Broad catch is deliberate: degrade to demo mode rather than fail
        # the whole app when the Hub is unreachable or weights are missing.
        print(
            f"Warning: Could not load model. Using RANDOMLY initialized model. Error: {e}"
        )
        print(" (Translations will be gibberish)")

    return transformer_model, tokenizer
38
+
39
+
40
# ==========================================
# 2. UI CONFIGURATION
# ==========================================
st.set_page_config(
    page_title="En-Vi Translator | AttentionIsAllYouBuild",
    page_icon="🤖",
    layout="centered",
    # layout="wide",
)

# Customize CSS to create beautiful interface
st.markdown(
    """
    <style>
    .main {
        background-color: #f5f5f5;
    }
    .stTextArea textarea {
        font-size: 16px;
    }
    .stButton button {
        width: 100%;
        background-color: #FF4B4B;
        color: white;
        font-weight: bold;
        padding: 10px;
    }
    .result-box {
        background-color: #ffffff;
        padding: 20px;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        border-left: 5px solid #FF4B4B;
    }
    .source-text {
        color: #666;
        font-style: italic;
        font-size: 14px;
        margin-bottom: 5px;
    }
    .translated-text {
        color: #333;
        font-size: 20px;
        font-weight: 600;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# ==========================================
# 3. MAIN APP LAYOUT
# ==========================================

# Header
st.title("🤖 AI Translator: English → Vietnamese")
st.markdown("### Project: *Attention Is All You Build*")
st.markdown("---")

# Sidebar (additional model information; UI text is intentionally Vietnamese)
with st.sidebar:
    st.header("ℹ️ Thông tin Model")
    st.info(
        """
        Đây là mô hình **Transformer (Encoder-Decoder)** được xây dựng "from scratch" bằng PyTorch.
        
        - **Kiến trúc**: Pre-LN Transformer
        - **Tokenizer**: BPE (32k vocab)
        - **Inference**: Greedy
        """
    )
    st.write("Created by [Your Name]")

# Input Area
input_text = st.text_area(
    label="Nhập câu tiếng Anh:",
    placeholder="Example: Artificial intelligence is transforming the world...",
    height=150,
)

# ==========================================
# 4. INFERENCE LOGIC
# ==========================================

# Translate button
if st.button("Dịch sang Tiếng Việt (Translate)"):
    if not input_text.strip():
        st.warning("⚠️ Vui lòng nhập nội dung cần dịch!")
    else:
        # Display spinner while model is running
        with st.spinner("Wait a second... AI is thinking 🧠"):
            try:
                # Measure inference time (first run also includes the cached
                # model/tokenizer load below)
                start_time = time.time()

                # --- Call translate function ---
                transformer_model, tokenizer = load_artifacts()

                if utils and transformer_model and tokenizer:
                    translation = utils.translate(
                        transformer_model,
                        tokenizer,
                        sentence_en=input_text,
                        device=config.DEVICE,
                        max_len=config.MAX_SEQ_LEN,
                        sos_token_id=config.SOS_TOKEN_ID,
                        eos_token_id=config.EOS_TOKEN_ID,
                        pad_token_id=config.PAD_TOKEN_ID,
                    )

                else:
                    # Mockup output when the model could not be loaded
                    time.sleep(1)  # Simulate latency
                    translation = "[DEMO OUTPUT] Hệ thống chưa load model thực tế. Đây là kết quả mẫu."

                end_time = time.time()
                inference_time = end_time - start_time

                # --- Display Result ---
                st.success(f"✅ Hoàn tất trong {inference_time:.2f}s")

                st.markdown("### Kết quả:")
                st.markdown(
                    f"""
                    <div class="result-box">
                        <div class="source-text">Original: {input_text}</div>
                        <div class="translated-text">{translation}</div>
                    </div>
                    """,
                    unsafe_allow_html=True,
                )

            except Exception as e:
                st.error(f"❌ Đã xảy ra lỗi trong quá trình dịch: {str(e)}")

# Footer
st.markdown("---")
st.caption("Powered by PyTorch & Streamlit")
src/tokenizer.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from datasets import Dataset
3
+ from tokenizers import (
4
+ Tokenizer,
5
+ models,
6
+ normalizers,
7
+ pre_tokenizers,
8
+ decoders,
9
+ trainers,
10
+ )
11
+ from tqdm.auto import tqdm
12
+ import wandb
13
+ from utils import get_raw_data
14
+
15
+
16
# Local dataset location (Windows-style relative path; assumes running from src/).
DATA_PATH = Path(r"..\data\IWSLT-15-en-vi")
# TOKENIZER_NAME = "iwslt_en-vi_tokenizer_16k.json"
TOKENIZER_NAME = "iwslt_en-vi_tokenizer_32k.json"
TOKENIZER_SAVE_PATH = Path(r"..\artifacts\tokenizers") / TOKENIZER_NAME

# Target BPE vocabulary size (counts the special tokens below).
# VOCAB_SIZE: int = 16_000
VOCAB_SIZE: int = 32_000
SPECIAL_TOKENS: list[str] = ["[PAD]", "[UNK]", "[SOS]", "[EOS]"]

# Sentence pairs yielded per iterator step during tokenizer training.
BATCH_SIZE_FOR_TOKENIZER: int = 10000
# Parallel workers for dataset filtering.
NUM_WORKERS: int = 8
27
+
28
+
29
def get_training_corpus(dataset: Dataset, batch_size: int = 1000):
    """
    Yield alternating batches of English and Vietnamese sentences.

    Streams the dataset through its highly optimized, zero-copy Arrow
    iterator (``dataset.iter(batch_size=...)``). Each chunk arrives as
    ``{'translation': [list of dicts]}``; for every chunk we yield two
    plain ``list[str]`` values — first the 'en' side, then the 'vi'
    side — which is exactly what the HF tokenizer trainer expects.
    """
    for chunk in dataset.iter(batch_size=batch_size):
        pairs = chunk["translation"]
        # One yield per language so the trainer sees both sides of the corpus.
        for lang in ("en", "vi"):
            yield [pair[lang] for pair in pairs]
54
+
55
+
56
def instantiate_tokenizer() -> Tokenizer:
    """
    Build an untrained BPE tokenizer with the project's text pipeline.

    Pipeline: NFKC unicode normalization + lowercasing, whitespace
    pre-tokenization (BPE then learns merges inside those "words"),
    and a BPE decoder for reconstructing strings from tokens.
    """
    bpe_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # Normalize unicode forms first, then fold everything to lowercase.
    bpe_tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFKC(), normalizers.Lowercase()]
    )

    # Split raw text on whitespace/punctuation before BPE merging.
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Inverse mapping: reassemble sub-word pieces during decoding.
    bpe_tokenizer.decoder = decoders.BPEDecoder()

    print("Tokenizer (empty) initialized.")
    return bpe_tokenizer
78
+
79
+
80
def train_tokenizer() -> None:
    """
    Train the project BPE tokenizer on the IWSLT-15 en-vi train split
    and save it to TOKENIZER_SAVE_PATH.

    Uses module-level constants for data path, vocab size, special
    tokens, and batch size. Interruptible via Ctrl-C (the partially
    trained tokenizer is still saved afterwards).
    """
    # Initialize the BpeTrainer
    trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS)

    print("Tokenizer Trainer initialized.")

    # Only the (filtered) train split is needed for tokenizer training.
    train_dataset = get_raw_data(DATA_PATH, for_tokenizer=True)
    if not isinstance(train_dataset, Dataset):
        train_dataset = Dataset.from_list(train_dataset)
    print(f"Starting tokenizer training on {len(train_dataset)} pairs...")

    # 1. Define the iterator AND batch size
    text_iterator = get_training_corpus(
        train_dataset,
        batch_size=BATCH_SIZE_FOR_TOKENIZER,
    )

    # 2. Calculate total steps for the progress bar
    # (x2 because the corpus generator yields en and vi batches separately)
    total_steps = (len(train_dataset) // BATCH_SIZE_FOR_TOKENIZER) * 2
    if total_steps == 0:
        total_steps = 1  # (Avoid division by zero if dataset is tiny)

    tokenizer: Tokenizer = instantiate_tokenizer()
    # 3. Train with tqdm progress bar
    try:
        tokenizer.train_from_iterator(
            tqdm(
                text_iterator,
                total=total_steps,
                desc="Training Tokenizer (IWSLT-Local)",
            ),
            trainer=trainer,
            length=total_steps,
        )
    except KeyboardInterrupt:
        print("\nTokenizer training interrupted by user.")

    print("Tokenizer training complete.")

    # Persist whatever was learned (complete or interrupted run).
    tokenizer.save(str(TOKENIZER_SAVE_PATH))

    print(f"Tokenizer saved to: {TOKENIZER_SAVE_PATH}")
    print(f"Total vocabulary size: {tokenizer.get_vocab_size()}")
123
+
124
+
125
if __name__ == "__main__":
    # Train the BPE tokenizer and write it to TOKENIZER_SAVE_PATH.
    train_tokenizer()

    # Log the trained tokenizer file to Weights & Biases as a versioned artifact.
    run = wandb.init(
        entity="alaindelong-hcmut",
        project="Attention Is All You Build",
        job_type="tokenizer-train",
    )

    tokenizer_artifact = wandb.Artifact(
        name="iwslt_en-vi_tokenizer",
        type="tokenizer",
        description="BPE Tokenizer trained on IWSLT 15 (133k+ pairs en-vi)",
        metadata={
            # Keep metadata in sync with the training constants above
            # (was hard-coded 32000 before).
            "vocab_size": VOCAB_SIZE,
            "algorithm": "BPE",
            "framework": "huggingface",
            "training_data": "iwslt-15-en-vi-133k",
            # The tokenizer's normalizer pipeline applies Lowercase()
            # (see instantiate_tokenizer), so the vocab IS lower-cased;
            # the previous False here was incorrect.
            "lower_case": True,
        },
    )
    tokenizer_artifact.add_file(local_path=str(TOKENIZER_SAVE_PATH))
    run.log_artifact(tokenizer_artifact, aliases=["baseline"])

    run.finish()
src/utils.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import random
3
+ import re
4
+ from datetime import datetime
5
+ import numpy as np
6
+ from datasets import DatasetDict, Dataset, load_dataset
7
+ import torch
8
+ from torch import Tensor
9
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
10
+ from jaxtyping import Bool, Int
11
+
12
+ # from src import model
13
+ import model
14
+
15
+
16
# Reproducibility helper: pin every RNG the project touches.
def seed_everything(seed: int = 42) -> None:
    """
    Seed Python's `random`, NumPy, and PyTorch (CPU + all CUDA devices),
    and force deterministic cuDNN behavior for reproducible runs.

    Args:
        seed (int): The seed value to use.
    """
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,      # no-op when CUDA is unavailable
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)
    # Trade cuDNN autotuning speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
30
+
31
+
32
def make_run_name(model_name: str, d_model: int) -> str:
    """Build a unique run name: ``<model>-<d_model>d-<YYYYmmdd_HHMMSS>``."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "-".join([model_name, f"{d_model}d", stamp])
35
+
36
+
37
# --- Helper functions for cleaning ---
def is_valid_pair(example: dict) -> bool:
    """Return True when the example has non-blank 'en' AND 'vi' text."""
    pair = example.get("translation", {})
    # Whitespace-only strings count as empty.
    return bool(pair.get("en", "").strip() and pair.get("vi", "").strip())
44
+
45
+
46
def filter_empty(dataset: Dataset, num_proc: int) -> Dataset:
    """
    Remove pairs with a blank 'en' or 'vi' side from one split.

    Uses the highly optimized ``Dataset.filter()`` with ``num_proc``
    parallel workers, and logs how many rows were dropped.
    """
    print(f" Filtering empty strings from split...")
    before = len(dataset)

    # is_valid_pair keeps only rows where both languages are non-blank.
    kept: Dataset = dataset.filter(is_valid_pair, num_proc=num_proc)

    print(f" Filtered {before - len(kept)} empty/invalid pairs.")
    return kept
62
+
63
+
64
# --- Dataset Loading & Splitting ---
def get_raw_data(
    dataset_path: str | Path, for_tokenizer: bool = False, num_workers: int = 8
) -> Dataset | tuple[Dataset, Dataset, Dataset]:
    """
    Load all dataset splits from a path and drop empty sentence pairs.

    Args:
        dataset_path (str | Path): Path to the dataset directory or config.
        for_tokenizer (bool): If True, return only the cleaned train split
            (all that tokenizer training needs). Otherwise return the
            (train, validation, test) triple for model training/eval.
        num_workers (int): Number of workers for parallel filtering.

    Returns:
        Dataset: Filtered train split (if for_tokenizer=True).
        tuple(Dataset, Dataset, Dataset): Filtered train/validation/test
            splits (if for_tokenizer=False).
    """
    print(f"Loading datasets from: {dataset_path}")
    splits: DatasetDict = load_dataset(path=str(dataset_path))
    print(splits)

    print("--- Filtering Datasets (Removing empty sentences) ---")
    # Clean each split in the same train -> validation -> test order.
    cleaned: dict[str, Dataset] = {
        name: filter_empty(splits[name], num_workers)
        for name in ("train", "validation", "test")
    }

    if for_tokenizer:
        return cleaned["train"]
    return cleaned["train"], cleaned["validation"], cleaned["test"]
94
+
95
+
96
# Load the trained tokenizer file and register the project's special tokens.
def load_tokenizer(tokenizer_path: str | Path) -> PreTrainedTokenizerFast:
    """
    Load a trained tokenizer from file as a PreTrainedTokenizerFast.

    Args:
        tokenizer_path (str | Path): Path to the tokenizer JSON file.

    Returns:
        PreTrainedTokenizerFast: Tokenizer with [PAD]/[UNK]/[SOS]/[EOS]
        registered as its pad/unk/bos/eos special tokens.
    """
    print(f"Loading tokenizer from {tokenizer_path}...")
    # tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path))
    # Map the project's special-token strings onto the HF attribute slots
    # so downstream code can use tokenizer.pad_token_id etc.
    tokenizer.pad_token = "[PAD]"
    tokenizer.unk_token = "[UNK]"
    tokenizer.bos_token = "[SOS]"  # bos = Beginning Of Sentence
    tokenizer.eos_token = "[EOS]"  # eos = End Of Sentence
    return tokenizer
115
+
116
+
117
def create_padding_mask(
    input_ids: Int[Tensor, "B T_k"], pad_token_id: int
) -> Bool[Tensor, "B 1 1 T_k"]:
    """
    Build a broadcastable key-padding mask for attention.

    Positions holding the pad token are marked False ("mask out");
    real tokens are True ("keep"). The two singleton dims broadcast
    over heads and query positions in a (B, H, T_q, T_k) score tensor.

    Args:
        input_ids (Tensor): The input token IDs. Shape (B, T_k).
        pad_token_id (int): The ID of the padding token.

    Returns:
        Tensor: A boolean mask of shape (B, 1, 1, T_k).
    """
    # (B, T_k): True wherever the token is NOT padding.
    keep: Tensor = input_ids.ne(pad_token_id)

    # Insert the head and T_q broadcast dims: (B, T_k) -> (B, 1, 1, T_k).
    return keep[:, None, None, :]
146
+
147
+
148
def create_look_ahead_mask(seq_len: int) -> Bool[Tensor, "1 1 T_q T_q"]:
    """
    Build the causal (look-ahead) mask for decoder self-attention.

    Position i may only attend to positions <= i: the lower triangle
    (including the diagonal) is True ("keep"), the upper triangle
    (future tokens) is False ("mask out").

    Args:
        seq_len (int): The sequence length (T_q).

    Returns:
        Tensor: A boolean mask of shape (1, 1, T_q, T_q).
    """
    # Lower-triangular boolean matrix (including the diagonal), e.g. T_q=3:
    # [[T, F, F],
    #  [T, T, F],
    #  [T, T, T]]
    causal: Tensor = torch.tril(torch.ones(seq_len, seq_len)) == 1

    # Add the batch and head broadcast dims: (T_q, T_q) -> (1, 1, T_q, T_q).
    return causal[None, None, :, :]
183
+
184
+
185
def greedy_decode_sentence(
    model: model.Transformer,
    src: Int[Tensor, "1 T_src"],  # Input: one sentence
    src_mask: Bool[Tensor, "1 1 1 T_src"],
    max_len: int,
    sos_token_id: int,
    eos_token_id: int,
    device: torch.device,
) -> Int[Tensor, "T_out"]:
    """
    Performs greedy decoding for a single sentence.
    This is an autoregressive process (token by token): the encoder
    runs once, then the decoder is re-run on the growing target
    sequence, always appending the highest-probability next token.

    Args:
        model: The trained Transformer model (already on device).
        src: The source token IDs (e.g., English).
        src_mask: The padding mask for the source.
        max_len: The maximum length to generate.
        sos_token_id: The ID for [SOS] token.
        eos_token_id: The ID for [EOS] token.
        device: The device to run on.

    Returns:
        Tensor: The generated target token IDs (e.g., Vietnamese),
        as a 1-D tensor of shape (T_out) — the batch dim is squeezed.
    """

    # Set model to eval mode (disables dropout)
    model.eval()

    # No gradients needed
    with torch.no_grad():

        # --- 1. Encode the source *once* ---
        # (B, T_src) -> (B, T_src, D)
        src_embedded = model.src_embed(src)
        src_with_pos = model.pos_enc(src_embedded)
        enc_output: Tensor = model.encoder(src_with_pos, src_mask)

        # --- 2. Initialize the Decoder input ---
        # Start with the [SOS] token. Shape: (1, 1)
        decoder_input: Tensor = torch.tensor(
            [[sos_token_id]], dtype=torch.long, device=device
        )  # Shape: (B=1, T_tgt=1)

        # --- 3. Autoregressive Loop ---
        for _ in range(max_len - 1):  # (Max length - 1, since we have [SOS])

            # --- a. Get Target Embedding + Position ---
            # (B, T_tgt) -> (B, T_tgt, D)
            tgt_embedded = model.tgt_embed(decoder_input)
            tgt_with_pos = model.pos_enc(tgt_embedded)

            # --- b. Create Target Mask (Causal) ---
            # We must re-create the mask every loop,
            # as T_tgt (decoder_input.size(1)) is growing.
            # Shape: (1, 1, T_tgt, T_tgt)
            T_tgt = decoder_input.size(1)
            tgt_mask = create_look_ahead_mask(T_tgt).to(device)

            # --- c. Run Decoder and Generator ---
            # (B, T_tgt, D)
            dec_output: Tensor = model.decoder(
                tgt_with_pos, enc_output, src_mask, tgt_mask
            )
            # (B, T_tgt, vocab_size)
            logits: Tensor = model.generator(dec_output)

            # --- d. Get the *last* token's logits ---
            # (B, T_tgt, vocab_size) -> (B, vocab_size)
            last_token_logits = logits[:, -1, :]

            # --- e. Greedy Search (get highest prob. token) ---
            # (B, vocab_size) -> (B, 1)
            next_token: Tensor = torch.argmax(last_token_logits, dim=-1).unsqueeze(-1)

            # --- f. Append the new token ---
            # (B, T_tgt) + (B, 1) -> (B, T_tgt + 1)
            decoder_input = torch.cat([decoder_input, next_token], dim=1)

            # --- g. Check for [EOS] ---
            # If the *last* token we added is [EOS], stop generating.
            if next_token.item() == eos_token_id:
                break

    return decoder_input.squeeze(0)  # Return shape (T_out)
270
+
271
+
272
def filter_and_detokenize(token_list: list[str], skip_special: bool = True) -> str:
    """
    Join whitespace-level tokens into a sentence and tidy punctuation.

    Optionally drops the special tokens first, then applies two
    heuristic regex fixes to undo whitespace tokenization artifacts:
    removing the space before punctuation ("project ." -> "project.")
    and re-attaching contractions ("don 't" -> "don't").
    """
    if skip_special:
        # Drop the reserved vocabulary entries before joining.
        specials = {"[PAD]", "[UNK]", "[SOS]", "[EOS]"}
        token_list = [token for token in token_list if token not in specials]

    sentence = " ".join(token_list)

    # "word ." -> "word."  (space before punctuation)
    sentence = re.sub(r'\s([.,!?\'":;])', r"\1", sentence)
    # "don 't" -> "don't"  (split contractions)
    return re.sub(r"(\w)\s(\'\w)", r"\1\2", sentence)
293
+
294
+
295
# High-level, production-ready inference entry point:
# raw English string in, Vietnamese string out.
def translate(
    model: model.Transformer,
    tokenizer: PreTrainedTokenizerFast,
    sentence_en: str,
    device: torch.device,
    max_len: int,
    sos_token_id: int,
    eos_token_id: int,
    pad_token_id: int,
) -> str:
    """
    Translates a single English sentence to Vietnamese.

    Pipeline: tokenize -> build padding mask -> greedy autoregressive
    decode -> detokenize.

    Args:
        model: The trained Transformer model.
        tokenizer: The (PreTrainedTokenizerFast) tokenizer.
        sentence_en: The raw English input string.
        device: The device to run on.
        max_len: The max sequence length (from config).
        sos_token_id: The ID for [SOS].
        eos_token_id: The ID for [EOS].
        pad_token_id: The ID for [PAD].

    Returns:
        str: The translated Vietnamese string.
    """
    # Disable dropout and run without gradient tracking.
    model.eval()
    with torch.no_grad():

        # 1. Tokenize the English source; the encoder input carries
        #    no [SOS]/[EOS], and is truncated to the model's max length.
        encoding = tokenizer(
            sentence_en,
            truncation=True,
            max_length=max_len,
            add_special_tokens=False,
        )

        # 2. Wrap as a (1, T_src) long tensor on the target device.
        src_ids: Tensor = torch.tensor(
            [encoding["input_ids"]], dtype=torch.long
        ).to(device)

        # 3. Source padding mask, shape (1, 1, 1, T_src).
        src_mask: Tensor = create_padding_mask(src_ids, pad_token_id).to(device)

        # 4. Greedy autoregressive decode -> 1-D tensor of target IDs.
        output_ids: Tensor = greedy_decode_sentence(
            model,
            src_ids,
            src_mask,
            max_len=max_len,
            sos_token_id=sos_token_id,
            eos_token_id=eos_token_id,
            device=device,
        )

        # 5. IDs -> token strings (1-D CPU list keeps convert_ids_to_tokens
        #    returning List[str]), then join/clean while dropping specials.
        token_strings = tokenizer.convert_ids_to_tokens(output_ids.cpu().tolist())
        return filter_and_detokenize(token_strings, skip_special=True)
374
+
375
+ print("Inference function `translate()` defined.")