ccss17 committed on
Commit
83ca46c
·
verified ·
1 Parent(s): 26c425c

Upload config.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.py +40 -0
config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+
4
+ @dataclass(frozen=True, slots=True)
5
+ class Profile:
6
+ # Transformer depth; shallow stacks limit overfitting on modest datasets
7
+ # and keep inference latency low on edge hardware.
8
+ num_layers: int
9
+ # Hidden width sized to match domain-string complexity without paying for
10
+ # unnecessary parameters.
11
+ d_model: int
12
+ # Multi-head attention count balances representational power against
13
+ # compute footprint for GPU inference.
14
+ nhead: int
15
+ # Feed-forward expansion keeps the standard 4x ratio for strong accuracy
16
+ # while maintaining throughput.
17
+ ffn_mult: int = 4
18
+ # Dropout of 10% reduces co-adaptation; higher rates slowed convergence in
19
+ # experiments.
20
+ dropout: float = 0.1
21
+ # AdamW learning rate chosen from sweep; 3e-4 converged stably without
22
+ # schedule tuning.
23
+ lr: float = 3e-4
24
+ # Weight decay curbs weight drift and improves generalization when
25
+ # training on long-running streams.
26
+ weight_decay: float = 0.01
27
+ # Label smoothing discourages over-confident logits and improves
28
+ # calibration on noisy threat labels.
29
+ label_smoothing: float = 0.05
30
+ # Sequence length covers observed second-level domains with spare room for
31
+ # rare longer names.
32
+ max_len: int = 64
33
+
34
+
35
# Named model-size presets, keyed by the identifier used to select a
# configuration at startup.
PROFILES = dict(
    # ~3.2M params - for quick testing
    tiny=Profile(num_layers=4, d_model=256, nhead=8),
    # ~10.8M params - balanced performance
    small=Profile(num_layers=6, d_model=384, nhead=6),
)