from dataclasses import dataclass


# frozen=True makes profiles immutable and hashable; slots=True (Python 3.10+)
# trims per-instance memory.
@dataclass(frozen=True, slots=True)
class Profile:
    # Transformer depth; shallow stacks limit overfitting on modest datasets
    # and keep inference latency low on edge hardware.
    num_layers: int
    # Hidden width sized to match domain-string complexity without paying for
    # unnecessary parameters.
    d_model: int
    # Multi-head attention count balances representational power against
    # compute footprint for GPU inference.
    nhead: int
    # Feed-forward expansion keeps the standard 4x ratio for strong accuracy
    # while maintaining throughput.
    ffn_mult: int = 4
    # Dropout of 10% reduces co-adaptation; higher rates slowed convergence in
    # experiments.
    dropout: float = 0.1
    # AdamW learning rate chosen from sweep; 3e-4 converged stably without
    # schedule tuning.
    lr: float = 3e-4
    # Weight decay curbs weight drift and improves generalization when
    # training on long-running streams.
    weight_decay: float = 0.01
    # Label smoothing discourages over-confident logits and improves
    # calibration on noisy threat labels.
    label_smoothing: float = 0.05
    # Sequence length covers observed second-level domains with spare room for
    # rare longer names.
    max_len: int = 64


PROFILES: dict[str, Profile] = {
    # ~3.2M params - for quick testing
    "tiny": Profile(num_layers=4, d_model=256, nhead=8),
    # ~10.8M params - balanced performance
    "small": Profile(num_layers=6, d_model=384, nhead=6),
}
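

# --- Illustrative sketch (not part of the original config.py) -----------------
# Wiring the optimization fields (lr, weight_decay, label_smoothing) to
# PyTorch training objects. `make_training_objects` is a hypothetical helper;
# `model` is any nn.Module, e.g. one built via build_encoder above.
def make_training_objects(profile: Profile, model):
    import torch  # assumed dependency (PyTorch)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=profile.lr,
        weight_decay=profile.weight_decay,
    )
    criterion = torch.nn.CrossEntropyLoss(
        label_smoothing=profile.label_smoothing
    )
    return optimizer, criterion


# Example: optimizer, criterion = make_training_objects(PROFILES["small"], model)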