# Hyperparameters and config settings.

# Size of token embeddings. Evenly divisible by NUM_HEADS (256 / 8 = 32
# dims per head), as required for multi-head attention.
EMBED_DIM = 256

# Number of attention heads.
NUM_HEADS = 8

# Number of transformer blocks.
NUM_LAYERS = 4

# Feedforward (inner) layer dimension of each transformer block.
FF_DIM = 512

# Maximum sequence length (in tokens).
MAX_SEQ_LEN = 256

# Placeholder vocabulary size — overridden at runtime based on the dataset.
VOCAB_SIZE = 100

# Bottleneck dimension of the adapter modules used for continual learning.
ADAPTER_DIM = 32