"""
GLADIUS v2.0 — Kernel Configuration
Every hyperparameter in one place. No magic numbers anywhere else.
Hardware target: Intel i3-1005G1, 16GB RAM, no GPU.
"""
from dataclasses import dataclass, field
from typing import Optional
import torch
@dataclass
class KernelConfig:
    """Master configuration for the GLADIUS kernel.

    Every hyperparameter lives here as a dataclass field with a default, so a
    bare ``KernelConfig()`` yields the full-size configuration.  Cross-field
    invariants are checked in ``__post_init__``.

    Raises:
        AssertionError: on construction, if ``hidden_dim`` differs from
            ``num_heads * head_dim`` or from ``cold_embedding_dim``.
    """

    # === Model Dimensions ===
    vocab_size: int = 32_000  # BPE vocabulary (SentencePiece)
    hidden_dim: int = 256  # Core representation width
    num_layers: int = 6  # Transformer depth
    num_heads: int = 8  # Attention heads
    head_dim: int = 32  # hidden_dim / num_heads
    ffn_dim: int = 1024  # SwiGLU intermediate (4x hidden)
    max_seq_len: int = 512  # Context window

    # === Memory: Hot ===
    hot_memory_slots: int = 512  # Learned KV cache slots
    hot_importance_threshold: float = 0.5  # Gate threshold for writes

    # === Memory: Warm ===
    warm_rank: int = 16  # LoRA/Locas adapter rank
    warm_condition_threshold: float = 10.0  # σ_max/σ_min rebalance trigger
    warm_balance_frequency: int = 100  # Steps between spectral health checks
    warm_novelty_threshold: float = 0.1  # Share subspace evolution trigger
    warm_checkpoint_interval: int = 300  # Steps between disk checkpoints

    # === Memory: Cold (HEKTOR) ===
    cold_embedding_dim: int = 256  # Match hidden_dim for direct injection
    cold_top_k: int = 4  # Retrieval results per query

    # === Time Engine ===
    time_dim: int = 32  # Temporal encoding dimension
    time_num_frequencies: int = 16  # Time2Vec learned frequencies
    time_max_events: int = 64  # Relative clock event buffer
    clock_mode: str = 'continuous'  # 'continuous' (Time2Vec) or 'lattice' (LatticeClock)
    lattice_size: int = 256  # Number of discrete positions per scale
    lattice_scales: int = 4  # Number of temporal scales

    # === Cognition ===
    cognition_state_dim: int = 64  # State monitor hidden size
    cognition_modes: int = 4  # active, monitoring, reflective, dormant
    cognition_prompt_types: int = 5  # curiosity, consolidation, planning, monitoring, creative

    # === Modulator ===
    register_dim: int = 4  # formal↔casual, technical↔simple, concise↔elaborate, warm↔detached
    intent_dim: int = 4  # inform, persuade, comfort, challenge
    silence_threshold: float = 0.7  # Silence gate activation

    # === Tool Cortex ===
    max_tools: int = 64  # Tool registry capacity
    tool_activation_threshold: float = 0.6  # Cosine sim threshold for invocation

    # === Router / Specialists ===
    num_specialists: int = 4  # reasoning, math, code, general
    router_top_k: int = 2  # Activate top-k specialists per token

    # === Attention: SLA2 Hybrid ===
    attention_sparse_budget: int = 64  # Top-k tokens for softmax path
    attention_alpha_init: float = 0.5  # Initial blend ratio (learnable)

    # === Training ===
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    warmup_steps: int = 500
    max_grad_norm: float = 1.0
    batch_size: int = 4  # CPU constraint
    accumulation_steps: int = 8  # Effective batch = 32

    # === Infrastructure ===
    device: str = 'cpu'
    dtype: torch.dtype = torch.float32
    checkpoint_dir: str = 'checkpoints'
    seed: int = 42

    # === Special Tokens ===
    pad_token_id: int = 0
    bos_token_id: int = 1
    eos_token_id: int = 2
    unk_token_id: int = 3

    def __post_init__(self):
        """Validate cross-field invariants right after dataclass init.

        Raises:
            AssertionError: if ``hidden_dim != num_heads * head_dim`` or if
                ``cold_embedding_dim != hidden_dim``.
        """
        # Explicit raises rather than ``assert`` statements: asserts are
        # compiled out under ``python -O``, which would let an inconsistent
        # configuration through unnoticed.  AssertionError is kept as the
        # exception type so callers that already catch it keep working.
        if self.hidden_dim != self.num_heads * self.head_dim:
            raise AssertionError(
                f"hidden_dim ({self.hidden_dim}) must equal num_heads ({self.num_heads}) * head_dim ({self.head_dim})"
            )
        if self.hidden_dim != self.cold_embedding_dim:
            raise AssertionError(
                "cold_embedding_dim must match hidden_dim for direct injection"
            )

    @property
    def estimated_params(self) -> dict:
        """Rough parameter count per component.

        Returns:
            A dict with one integer entry per counted component
            (``embeddings``, ``transformer``, ``hot_memory``, ``warm_memory``,
            ``time_engine``, ``cognition``, ``modulator``, ``tool_cortex``),
            their integer sum under ``total``, and that sum converted to
            mebibytes at 4 and 2 bytes per parameter (``total_MB_f32``,
            ``total_MB_f16``).  Only the components listed are counted.
        """
        embed = self.vocab_size * self.hidden_dim * 2  # token + output head
        attn_per_layer = 4 * self.hidden_dim * self.hidden_dim  # Q, K, V, O
        ffn_per_layer = 3 * self.hidden_dim * self.ffn_dim  # gate, up, down (SwiGLU)
        transformer = self.num_layers * (attn_per_layer + ffn_per_layer)
        hot_mem = 2 * self.hot_memory_slots * self.hidden_dim  # keys + values
        warm_mem = self.num_layers * 3 * self.hidden_dim * self.warm_rank  # per-layer Locas
        time_eng = self.time_dim * self.time_num_frequencies * 4  # rough
        cognition = self.cognition_state_dim * self.hidden_dim * 4  # rough
        modulator = (self.register_dim + self.intent_dim) * self.vocab_size  # bias layer
        tools = self.max_tools * self.hidden_dim * 3  # embeddings + cross-attn
        total = embed + transformer + hot_mem + warm_mem + time_eng + cognition + modulator + tools
        return {
            'embeddings': embed,
            'transformer': transformer,
            'hot_memory': hot_mem,
            'warm_memory': warm_mem,
            'time_engine': time_eng,
            'cognition': cognition,
            'modulator': modulator,
            'tool_cortex': tools,
            'total': total,
            'total_MB_f32': total * 4 / 1024 / 1024,
            'total_MB_f16': total * 2 / 1024 / 1024,
        }
# === Presets ===
def tiny_config() -> KernelConfig:
    """Build the smallest preset, intended for tests.

    The original note sizes this at roughly 1M parameters (not re-derived
    here; ``KernelConfig.estimated_params`` gives a computed breakdown).
    Fields not listed below keep their ``KernelConfig`` defaults.

    Returns:
        A freshly constructed ``KernelConfig``.
    """
    overrides = dict(
        # Model dimensions: 4 heads * 16 per head == 64 hidden.
        vocab_size=1000,
        hidden_dim=64,
        num_layers=2,
        num_heads=4,
        head_dim=16,
        ffn_dim=256,
        max_seq_len=128,
        # Memory tiers.
        hot_memory_slots=32,
        warm_rank=4,
        # Tools / routing.
        max_tools=8,
        num_specialists=2,
        # Set equal to hidden_dim, as KernelConfig's validation requires.
        cold_embedding_dim=64,
    )
    return KernelConfig(**overrides)
def dev_config() -> KernelConfig:
    """Build the development preset for fast iteration on CPU.

    The original note sizes this at roughly 10M parameters (not re-derived
    here; ``KernelConfig.estimated_params`` gives a computed breakdown).
    Fields not listed below keep their ``KernelConfig`` defaults.

    Returns:
        A freshly constructed ``KernelConfig``.
    """
    overrides = dict(
        # Model dimensions: 4 heads * 32 per head == 128 hidden.
        vocab_size=8000,
        hidden_dim=128,
        num_layers=4,
        num_heads=4,
        head_dim=32,
        ffn_dim=512,
        max_seq_len=256,
        # Memory tiers.
        hot_memory_slots=128,
        warm_rank=8,
        # Tools / routing.
        max_tools=16,
        num_specialists=2,
        # Set equal to hidden_dim, as KernelConfig's validation requires.
        cold_embedding_dim=128,
    )
    return KernelConfig(**overrides)
def full_config() -> KernelConfig:
    """Build the target (full-size) preset.

    No overrides are applied: the ``KernelConfig`` field defaults are the
    full configuration.  The original note sizes it at roughly 30M
    parameters (not re-derived here).

    Returns:
        A freshly constructed ``KernelConfig`` with all defaults.
    """
    config = KernelConfig()
    return config
|