from dataclasses import dataclass


@dataclass(frozen=True, slots=True)
class Profile:
    # Transformer depth; shallow stacks limit overfitting on modest datasets
    # and keep inference latency low on edge hardware.
    num_layers: int

    # Hidden width sized to match domain-string complexity without paying for
    # unnecessary parameters.
    d_model: int

    # Multi-head attention count balances representational power against
    # compute footprint for GPU inference.
    nhead: int

    # Feed-forward expansion keeps the standard 4x ratio for strong accuracy
    # while maintaining throughput.
    ffn_mult: int = 4

    # Dropout of 10% reduces co-adaptation; higher rates slowed convergence in
    # experiments.
    dropout: float = 0.1

    # AdamW learning rate chosen from sweep; 3e-4 converged stably without
    # schedule tuning.
    lr: float = 3e-4

    # Weight decay curbs weight drift and improves generalization when
    # training on long-running streams.
    weight_decay: float = 0.01

    # Label smoothing discourages over-confident logits and improves
    # calibration on noisy threat labels.
    label_smoothing: float = 0.05

    # Sequence length covers observed second-level domains with spare room for
    # rare longer names.
    max_len: int = 64


PROFILES = {
    # ~3.2M params - for quick testing
    "tiny": Profile(num_layers=4, d_model=256, nhead=8),
    # ~10.8M params - balanced performance
    "small": Profile(num_layers=6, d_model=384, nhead=6),
}
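

# A minimal usage sketch, not part of the shipped training code: it shows one
# way a Profile's fields could wire into a standard PyTorch encoder stack and
# AdamW optimizer. The names build_model and build_optimizer_and_loss, the
# vocab_size and num_classes defaults, and mean-pooling for the classifier
# head are all assumptions introduced here for illustration; the real model
# code lives elsewhere and may differ.
def build_model(profile: Profile, vocab_size: int = 258, num_classes: int = 2):
    """Assemble an encoder-only classifier from a Profile (illustrative)."""
    # Local import keeps this config module importable without torch installed.
    from torch import nn

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=profile.d_model,
        nhead=profile.nhead,
        dim_feedforward=profile.d_model * profile.ffn_mult,
        dropout=profile.dropout,
        batch_first=True,
    )

    class Classifier(nn.Module):
        def __init__(self):
            super().__init__()
            # Token ids are expected to be padded/truncated to profile.max_len
            # upstream; the embedding maps them into the model width.
            self.embed = nn.Embedding(vocab_size, profile.d_model)
            self.encoder = nn.TransformerEncoder(
                encoder_layer, num_layers=profile.num_layers
            )
            self.head = nn.Linear(profile.d_model, num_classes)

        def forward(self, tokens):  # tokens: (batch, seq_len) int64
            hidden = self.encoder(self.embed(tokens))
            # Mean-pool over the sequence before the classification head
            # (an illustrative choice; a CLS token would also work).
            return self.head(hidden.mean(dim=1))

    return Classifier()


def build_optimizer_and_loss(model, profile: Profile):
    """Wire the Profile's training hyperparameters into AdamW and the loss."""
    import torch
    from torch import nn

    optimizer = torch.optim.AdamW(
        model.parameters(), lr=profile.lr, weight_decay=profile.weight_decay
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=profile.label_smoothing)
    return optimizer, criterion


# Example: model = build_model(PROFILES["tiny"])
#          optimizer, criterion = build_optimizer_and_loss(model, PROFILES["tiny"])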