File size: 6,279 Bytes
9463e5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
GLADIUS v2.0 — Kernel Configuration

Every hyperparameter in one place. No magic numbers anywhere else.
Hardware target: Intel i3-1005G1, 16GB RAM, no GPU.
"""

from dataclasses import dataclass, field
from typing import Optional
import torch


class KernelConfigError(ValueError, AssertionError):
    """Raised when a ``KernelConfig`` is internally inconsistent.

    Derives from ``AssertionError`` in addition to ``ValueError`` so that
    callers written against the earlier ``assert``-based validation (which
    raised ``AssertionError``) continue to catch it.
    """


@dataclass
class KernelConfig:
    """Master configuration for the GLADIUS kernel.

    Every field has a default; the defaults together form the full target
    configuration. Consistency between dependent dimensions is checked in
    ``__post_init__``.

    Raises:
        KernelConfigError: if ``hidden_dim != num_heads * head_dim`` or
            ``hidden_dim != cold_embedding_dim``.
    """

    # === Model Dimensions ===
    vocab_size: int = 32_000          # BPE vocabulary (SentencePiece)
    hidden_dim: int = 256             # Core representation width
    num_layers: int = 6               # Transformer depth
    num_heads: int = 8                # Attention heads
    head_dim: int = 32                # hidden_dim / num_heads (validated below)
    ffn_dim: int = 1024               # SwiGLU intermediate (4x hidden)
    max_seq_len: int = 512            # Context window

    # === Memory: Hot ===
    hot_memory_slots: int = 512       # Learned KV cache slots
    hot_importance_threshold: float = 0.5  # Gate threshold for writes

    # === Memory: Warm ===
    warm_rank: int = 16               # LoRA/Locas adapter rank
    warm_condition_threshold: float = 10.0  # σ_max/σ_min rebalance trigger
    warm_balance_frequency: int = 100  # Steps between spectral health checks
    warm_novelty_threshold: float = 0.1  # Share subspace evolution trigger
    warm_checkpoint_interval: int = 300  # Steps between disk checkpoints

    # === Memory: Cold (HEKTOR) ===
    cold_embedding_dim: int = 256     # Match hidden_dim for direct injection (validated below)
    cold_top_k: int = 4              # Retrieval results per query

    # === Time Engine ===
    time_dim: int = 32               # Temporal encoding dimension
    time_num_frequencies: int = 16    # Time2Vec learned frequencies
    time_max_events: int = 64        # Relative clock event buffer
    clock_mode: str = 'continuous'   # 'continuous' (Time2Vec) or 'lattice' (LatticeClock)
    lattice_size: int = 256          # Number of discrete positions per scale
    lattice_scales: int = 4          # Number of temporal scales

    # === Cognition ===
    cognition_state_dim: int = 64    # State monitor hidden size
    cognition_modes: int = 4          # active, monitoring, reflective, dormant
    cognition_prompt_types: int = 5   # curiosity, consolidation, planning, monitoring, creative

    # === Modulator ===
    register_dim: int = 4             # formal↔casual, technical↔simple, concise↔elaborate, warm↔detached
    intent_dim: int = 4               # inform, persuade, comfort, challenge
    silence_threshold: float = 0.7    # Silence gate activation

    # === Tool Cortex ===
    max_tools: int = 64              # Tool registry capacity
    tool_activation_threshold: float = 0.6  # Cosine sim threshold for invocation

    # === Router / Specialists ===
    num_specialists: int = 4          # reasoning, math, code, general
    router_top_k: int = 2            # Activate top-k specialists per token

    # === Attention: SLA2 Hybrid ===
    attention_sparse_budget: int = 64  # Top-k tokens for softmax path
    attention_alpha_init: float = 0.5  # Initial blend ratio (learnable)

    # === Training ===
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    warmup_steps: int = 500
    max_grad_norm: float = 1.0
    batch_size: int = 4               # CPU constraint
    accumulation_steps: int = 8       # Effective batch = 32

    # === Infrastructure ===
    device: str = 'cpu'
    dtype: torch.dtype = torch.float32
    checkpoint_dir: str = 'checkpoints'
    seed: int = 42

    # === Special Tokens ===
    pad_token_id: int = 0
    bos_token_id: int = 1
    eos_token_id: int = 2
    unk_token_id: int = 3

    def __post_init__(self):
        """Validate cross-field invariants.

        Uses explicit raises rather than ``assert`` so the checks still run
        when Python is started with ``-O`` (which strips assert statements).

        Raises:
            KernelConfigError: on any violated invariant.
        """
        if self.hidden_dim != self.num_heads * self.head_dim:
            raise KernelConfigError(
                f"hidden_dim ({self.hidden_dim}) must equal num_heads ({self.num_heads}) * head_dim ({self.head_dim})"
            )
        if self.hidden_dim != self.cold_embedding_dim:
            raise KernelConfigError(
                "cold_embedding_dim must match hidden_dim for direct injection"
            )

    @property
    def estimated_params(self) -> dict:
        """Rough parameter count per component.

        Returns:
            A dict with one integer entry per component ('embeddings',
            'transformer', 'hot_memory', 'warm_memory', 'time_engine',
            'cognition', 'modulator', 'tool_cortex'), their sum under
            'total', and that sum converted to mebibytes at 4 and 2 bytes
            per parameter under 'total_MB_f32' / 'total_MB_f16'.

        The formula has no term for the router/specialists, the lattice
        clock, or biases/norms, so the result is a ballpark figure only.
        """
        embed = self.vocab_size * self.hidden_dim * 2  # token + output head
        attn_per_layer = 4 * self.hidden_dim * self.hidden_dim  # Q, K, V, O
        ffn_per_layer = 3 * self.hidden_dim * self.ffn_dim  # gate, up, down (SwiGLU)
        transformer = self.num_layers * (attn_per_layer + ffn_per_layer)
        hot_mem = 2 * self.hot_memory_slots * self.hidden_dim  # keys + values
        warm_mem = self.num_layers * 3 * self.hidden_dim * self.warm_rank  # per-layer Locas
        time_eng = self.time_dim * self.time_num_frequencies * 4  # rough
        cognition = self.cognition_state_dim * self.hidden_dim * 4  # rough
        modulator = (self.register_dim + self.intent_dim) * self.vocab_size  # bias layer
        tools = self.max_tools * self.hidden_dim * 3  # embeddings + cross-attn
        total = embed + transformer + hot_mem + warm_mem + time_eng + cognition + modulator + tools

        return {
            'embeddings': embed,
            'transformer': transformer,
            'hot_memory': hot_mem,
            'warm_memory': warm_mem,
            'time_engine': time_eng,
            'cognition': cognition,
            'modulator': modulator,
            'tool_cortex': tools,
            'total': total,
            # Bytes per parameter (4 for f32, 2 for f16) -> MiB.
            'total_MB_f32': total * 4 / 1024 / 1024,
            'total_MB_f16': total * 2 / 1024 / 1024,
        }


# === Presets ===

def tiny_config() -> KernelConfig:
    """Smallest preset, meant for tests (nominally ~1M params).

    Only the fields listed below are overridden; everything else keeps the
    ``KernelConfig`` default.
    """
    overrides = {
        # Core transformer shape (hidden_dim == num_heads * head_dim).
        'vocab_size': 1000,
        'hidden_dim': 64,
        'num_layers': 2,
        'num_heads': 4,
        'head_dim': 16,
        'ffn_dim': 256,
        'max_seq_len': 128,
        # Memory tiers; cold_embedding_dim must track hidden_dim.
        'hot_memory_slots': 32,
        'warm_rank': 4,
        'cold_embedding_dim': 64,
        # Tooling / routing capacity.
        'max_tools': 8,
        'num_specialists': 2,
    }
    return KernelConfig(**overrides)

def dev_config() -> KernelConfig:
    """Mid-sized preset for development and fast CPU iteration (nominally ~10M params).

    Only the fields listed below are overridden; everything else keeps the
    ``KernelConfig`` default.
    """
    overrides = {
        # Core transformer shape (hidden_dim == num_heads * head_dim).
        'vocab_size': 8000,
        'hidden_dim': 128,
        'num_layers': 4,
        'num_heads': 4,
        'head_dim': 32,
        'ffn_dim': 512,
        'max_seq_len': 256,
        # Memory tiers; cold_embedding_dim must track hidden_dim.
        'hot_memory_slots': 128,
        'warm_rank': 8,
        'cold_embedding_dim': 128,
        # Tooling / routing capacity.
        'max_tools': 16,
        'num_specialists': 2,
    }
    return KernelConfig(**overrides)

def full_config() -> KernelConfig:
    """Return the target configuration (nominally ~30M params).

    No overrides are applied: the ``KernelConfig`` field defaults are the
    full configuration.
    """
    target = KernelConfig()
    return target