"""Central configuration for the English-to-Vietnamese Transformer project.

Every setting is a plain module-level constant so callers can simply do
``from config import DEVICE, BATCH_SIZE, ...``.
"""

from pathlib import Path

import torch

# --- Path Configuration ----------------------------------------------------
# Forward slashes: pathlib builds the correct native path on every OS.
# (Raw Windows back-slash strings break on Linux, e.g. on Kaggle, where the
# back-slash becomes a literal character inside a single path component.)
DATA_PATH = Path("data/IWSLT-15-en-vi")

TOKENIZER_NAME = "iwslt_en-vi_tokenizer_32k.json"
TOKENIZER_PATH = Path("artifacts/tokenizers") / TOKENIZER_NAME

MODEL_DIR = Path("artifacts/models")
MODEL_NAME = "transformer_en_vi_iwslt_1.safetensors"
# NOTE: keep this filename in sync with FILENAME in the HuggingFace section.
MODEL_SAVE_PATH = MODEL_DIR / "transformer_en_vi_iwslt_kaggle_1.safetensors"
CHECKPOINT_PATH = Path("artifacts/checkpoints") / MODEL_NAME
CACHE_DIR = ""  # empty string -> let the consuming library use its default cache

# --- Hardware & Data Config -------------------------------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_WORKERS: int = 4  # DataLoader worker processes
VOCAB_SIZE: int = 32_000
# Order matters: the index of each token here is its id (see *_TOKEN_ID below).
SPECIAL_TOKENS: list[str] = ["[PAD]", "[UNK]", "[SOS]", "[EOS]"]
NUM_SAMPLES_TO_USE: int = 1000  # training-subset size for quick experiments

# --- Tokenizer Constants ----------------------------------------------------
# Must match the positions in SPECIAL_TOKENS above.
PAD_TOKEN_ID: int = 0
UNK_TOKEN_ID: int = 1
SOS_TOKEN_ID: int = 2
EOS_TOKEN_ID: int = 3

# --- Model Hyperparameters ("Attention Is All You Need" base settings) ------
D_MODEL: int = 512      # model / embedding dimension
N_LAYERS: int = 6       # encoder and decoder layers (N=6 in paper)
N_HEADS: int = 8        # attention heads (h=8 in paper)
D_FF: int = 2048        # feed-forward inner dimension (d_ff = 4 * d_model)
DROPOUT: float = 0.1    # dropout rate (0.1 in paper)
MAX_SEQ_LEN: int = 150  # max length for positional encoding

# --- Training Configuration -------------------------------------------------
LEARNING_RATE: float = 5e-4
BATCH_SIZE: int = 32
EPOCHS: int = 5

# --- HuggingFace ------------------------------------------------------------
REPO_ID: str = "AlainDeLong/transformer-en-vi-base"
FILENAME: str = "transformer_en_vi_iwslt_kaggle_1.safetensors"

if __name__ == "__main__":
    print(f"Using device: {DEVICE}")