from dataclasses import dataclass


@dataclass
class Profile:
    # Transformer depth; shallow stacks limit overfitting on modest datasets
    # and keep inference latency low on edge hardware.
    num_layers: int
    # Hidden width sized to match domain-string complexity without paying for
    # unnecessary parameters.
    d_model: int
    # Multi-head attention count balances representational power against
    # compute footprint for GPU inference.
    nhead: int
    # Feed-forward expansion keeps the standard 4x ratio for strong accuracy
    # while maintaining throughput.
    ffn_mult: int = 4
    # Dropout of 10% reduces co-adaptation; higher rates slowed convergence in
    # experiments.
    dropout: float = 0.1
    # AdamW learning rate chosen from sweep; 3e-4 converged stably without
    # schedule tuning.
    lr: float = 3e-4
    # Weight decay curbs weight drift and improves generalization when
    # training on long-running streams.
    weight_decay: float = 0.01
    # Label smoothing discourages over-confident logits and improves
    # calibration on noisy threat labels.
    label_smoothing: float = 0.05
    # Sequence length covers observed second-level domains with spare room for
    # rare longer names.
    max_len: int = 64


PROFILES = {
    # ~3.2M params - for quick testing
    "tiny": Profile(num_layers=4, d_model=256, nhead=8),
    # ~10.8M params - balanced performance
    "small": Profile(num_layers=6, d_model=384, nhead=6),
}
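

# How a profile might wire into a model and optimizer. This is a minimal
# sketch assuming a PyTorch encoder stack; the project's actual model and
# training code are not shown here, so everything below is illustrative only.
if __name__ == "__main__":
    import torch
    from torch import nn

    profile = PROFILES["small"]
    # Standard encoder layer: feed-forward width follows the ffn_mult ratio.
    layer = nn.TransformerEncoderLayer(
        d_model=profile.d_model,
        nhead=profile.nhead,
        dim_feedforward=profile.d_model * profile.ffn_mult,
        dropout=profile.dropout,
        batch_first=True,
    )
    encoder = nn.TransformerEncoder(layer, num_layers=profile.num_layers)
    # Optimizer hyperparameters come straight from the profile.
    optimizer = torch.optim.AdamW(
        encoder.parameters(), lr=profile.lr, weight_decay=profile.weight_decay
    )
    # Label smoothing is applied at the loss, matching the profile setting.
    criterion = nn.CrossEntropyLoss(label_smoothing=profile.label_smoothing)
    print(encoder)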