Upload config.py with huggingface_hub
Browse files
config.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@dataclass(frozen=True, slots=True)
|
| 5 |
+
class Profile:
|
| 6 |
+
# Transformer depth; shallow stacks limit overfitting on modest datasets
|
| 7 |
+
# and keep inference latency low on edge hardware.
|
| 8 |
+
num_layers: int
|
| 9 |
+
# Hidden width sized to match domain-string complexity without paying for
|
| 10 |
+
# unnecessary parameters.
|
| 11 |
+
d_model: int
|
| 12 |
+
# Multi-head attention count balances representational power against
|
| 13 |
+
# compute footprint for GPU inference.
|
| 14 |
+
nhead: int
|
| 15 |
+
# Feed-forward expansion keeps the standard 4x ratio for strong accuracy
|
| 16 |
+
# while maintaining throughput.
|
| 17 |
+
ffn_mult: int = 4
|
| 18 |
+
# Dropout of 10% reduces co-adaptation; higher rates slowed convergence in
|
| 19 |
+
# experiments.
|
| 20 |
+
dropout: float = 0.1
|
| 21 |
+
# AdamW learning rate chosen from sweep; 3e-4 converged stably without
|
| 22 |
+
# schedule tuning.
|
| 23 |
+
lr: float = 3e-4
|
| 24 |
+
# Weight decay curbs weight drift and improves generalization when
|
| 25 |
+
# training on long-running streams.
|
| 26 |
+
weight_decay: float = 0.01
|
| 27 |
+
# Label smoothing discourages over-confident logits and improves
|
| 28 |
+
# calibration on noisy threat labels.
|
| 29 |
+
label_smoothing: float = 0.05
|
| 30 |
+
# Sequence length covers observed second-level domains with spare room for
|
| 31 |
+
# rare longer names.
|
| 32 |
+
max_len: int = 64
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Named presets keyed by profile name. Only depth, width and head count are
# given per preset; every other field keeps its Profile default.
PROFILES = dict(
    # ~3.2M params - for quick testing
    tiny=Profile(num_layers=4, d_model=256, nhead=8),
    # ~10.8M params - balanced performance
    small=Profile(num_layers=6, d_model=384, nhead=6),
)
|