from dataclasses import dataclass


@dataclass
class Profile:
    # Transformer depth; shallow stacks limit overfitting on modest datasets
    # and keep inference latency low on edge hardware.
    num_layers: int
    # Hidden width sized to match domain-string complexity without paying for
    # unnecessary parameters.
    d_model: int
    # Multi-head attention count balances representational power against
    # compute footprint for GPU inference.
    nhead: int
    # Feed-forward expansion keeps the standard 4x ratio for strong accuracy
    # while maintaining throughput.
    ffn_mult: int = 4
    # Dropout of 10% reduces co-adaptation; higher rates slowed convergence in
    # experiments.
    dropout: float = 0.1
    # AdamW learning rate chosen from sweep; 3e-4 converged stably without
    # schedule tuning.
    lr: float = 3e-4
    # Weight decay curbs weight drift and improves generalization when
    # training on long-running streams.
    weight_decay: float = 0.01
    # Label smoothing discourages over-confident logits and improves
    # calibration on noisy threat labels.
    label_smoothing: float = 0.05
    # Sequence length covers observed second-level domains with spare room for
    # rare longer names.
    max_len: int = 64


PROFILES = {
    # ~3.2M params - for quick testing
    "tiny": Profile(num_layers=4, d_model=256, nhead=8),
    # ~10.8M params - balanced performance
    "small": Profile(num_layers=6, d_model=384, nhead=6),
}
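

# How a profile might wire into a model and optimizer. This is a minimal
# sketch assuming a PyTorch encoder stack; the project's actual model and
# training code are not shown here, so everything below is illustrative only.
if __name__ == "__main__":
    import torch
    from torch import nn

    profile = PROFILES["small"]
    # Standard encoder layer: feed-forward width follows the ffn_mult ratio.
    layer = nn.TransformerEncoderLayer(
        d_model=profile.d_model,
        nhead=profile.nhead,
        dim_feedforward=profile.d_model * profile.ffn_mult,
        dropout=profile.dropout,
        batch_first=True,
    )
    encoder = nn.TransformerEncoder(layer, num_layers=profile.num_layers)
    # Optimizer hyperparameters come straight from the profile.
    optimizer = torch.optim.AdamW(
        encoder.parameters(), lr=profile.lr, weight_decay=profile.weight_decay
    )
    # Label smoothing is applied at the loss, matching the profile setting.
    criterion = nn.CrossEntropyLoss(label_smoothing=profile.label_smoothing)
    print(encoder)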