File size: 6,279 Bytes

9463e5c

"""
GLADIUS v2.0 — Kernel Configuration

Every hyperparameter in one place. No magic numbers anywhere else.
Hardware target: Intel i3-1005G1, 16GB RAM, no GPU.
"""

from dataclasses import dataclass, field
from typing import Optional
import torch


@dataclass
class KernelConfig:
    """Master configuration for the GLADIUS kernel.

    Every hyperparameter is a dataclass field with a default value, so a
    bare ``KernelConfig()`` is valid and individual values can be overridden
    by keyword. Two cross-field invariants are checked on construction:

    * ``hidden_dim == num_heads * head_dim``
    * ``hidden_dim == cold_embedding_dim``

    Raises:
        AssertionError: If either invariant above does not hold.
    """

    # === Model Dimensions ===
    vocab_size: int = 32_000          # BPE vocabulary (SentencePiece)
    hidden_dim: int = 256             # Core representation width
    num_layers: int = 6               # Transformer depth
    num_heads: int = 8                # Attention heads
    head_dim: int = 32                # hidden_dim / num_heads (validated in __post_init__)
    ffn_dim: int = 1024               # SwiGLU intermediate (4x hidden)
    max_seq_len: int = 512            # Context window

    # === Memory: Hot ===
    hot_memory_slots: int = 512       # Learned KV cache slots
    hot_importance_threshold: float = 0.5  # Gate threshold for writes

    # === Memory: Warm ===
    warm_rank: int = 16               # LoRA/Locas adapter rank
    warm_condition_threshold: float = 10.0  # σ_max/σ_min rebalance trigger
    warm_balance_frequency: int = 100  # Steps between spectral health checks
    warm_novelty_threshold: float = 0.1  # Share subspace evolution trigger
    warm_checkpoint_interval: int = 300  # Steps between disk checkpoints

    # === Memory: Cold (HEKTOR) ===
    cold_embedding_dim: int = 256     # Must equal hidden_dim (validated in __post_init__)
    cold_top_k: int = 4              # Retrieval results per query

    # === Time Engine ===
    time_dim: int = 32               # Temporal encoding dimension
    time_num_frequencies: int = 16    # Time2Vec learned frequencies
    time_max_events: int = 64        # Relative clock event buffer
    clock_mode: str = 'continuous'   # 'continuous' (Time2Vec) or 'lattice' (LatticeClock)
    lattice_size: int = 256          # Number of discrete positions per scale
    lattice_scales: int = 4          # Number of temporal scales

    # === Cognition ===
    cognition_state_dim: int = 64    # State monitor hidden size
    cognition_modes: int = 4          # active, monitoring, reflective, dormant
    cognition_prompt_types: int = 5   # curiosity, consolidation, planning, monitoring, creative

    # === Modulator ===
    register_dim: int = 4             # formal↔casual, technical↔simple, concise↔elaborate, warm↔detached
    intent_dim: int = 4               # inform, persuade, comfort, challenge
    silence_threshold: float = 0.7    # Silence gate activation

    # === Tool Cortex ===
    max_tools: int = 64              # Tool registry capacity
    tool_activation_threshold: float = 0.6  # Cosine sim threshold for invocation

    # === Router / Specialists ===
    num_specialists: int = 4          # reasoning, math, code, general
    router_top_k: int = 2            # Activate top-k specialists per token

    # === Attention: SLA2 Hybrid ===
    attention_sparse_budget: int = 64  # Top-k tokens for softmax path
    attention_alpha_init: float = 0.5  # Initial blend ratio (learnable)

    # === Training ===
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    warmup_steps: int = 500
    max_grad_norm: float = 1.0
    batch_size: int = 4               # CPU constraint
    accumulation_steps: int = 8       # Effective batch = 32 with the default batch_size

    # === Infrastructure ===
    device: str = 'cpu'
    dtype: torch.dtype = torch.float32
    checkpoint_dir: str = 'checkpoints'
    seed: int = 42

    # === Special Tokens ===
    pad_token_id: int = 0
    bos_token_id: int = 1
    eos_token_id: int = 2
    unk_token_id: int = 3

    def __post_init__(self) -> None:
        """Validate the cross-field dimension invariants.

        The checks use explicit ``raise`` rather than ``assert`` statements
        so they still run when Python is started with ``-O`` (which strips
        asserts). ``AssertionError`` is kept as the exception type so code
        that already catches it keeps working.

        Raises:
            AssertionError: If ``hidden_dim != num_heads * head_dim`` or
                ``hidden_dim != cold_embedding_dim``.
        """
        if self.hidden_dim != self.num_heads * self.head_dim:
            raise AssertionError(
                f"hidden_dim ({self.hidden_dim}) must equal num_heads ({self.num_heads}) * head_dim ({self.head_dim})"
            )
        if self.hidden_dim != self.cold_embedding_dim:
            raise AssertionError(
                "cold_embedding_dim must match hidden_dim for direct injection"
            )

    @property
    def estimated_params(self) -> dict:
        """Rough parameter count per component.

        Returns:
            A dict with one integer entry per component (``'embeddings'``,
            ``'transformer'``, ``'hot_memory'``, ``'warm_memory'``,
            ``'time_engine'``, ``'cognition'``, ``'modulator'``,
            ``'tool_cortex'``), their integer sum under ``'total'``, and the
            size of that total in MiB at 4 and 2 bytes per parameter under
            ``'total_MB_f32'`` and ``'total_MB_f16'``.

        Only the components listed above are counted; the figures are
        back-of-envelope estimates, not exact module sizes.
        """
        attn_per_layer = 4 * self.hidden_dim * self.hidden_dim  # Q, K, V, O
        ffn_per_layer = 3 * self.hidden_dim * self.ffn_dim  # gate, up, down (SwiGLU)

        counts = {
            'embeddings': self.vocab_size * self.hidden_dim * 2,  # token + output head
            'transformer': self.num_layers * (attn_per_layer + ffn_per_layer),
            'hot_memory': 2 * self.hot_memory_slots * self.hidden_dim,  # keys + values
            'warm_memory': self.num_layers * 3 * self.hidden_dim * self.warm_rank,  # per-layer Locas
            'time_engine': self.time_dim * self.time_num_frequencies * 4,  # rough
            'cognition': self.cognition_state_dim * self.hidden_dim * 4,  # rough
            'modulator': (self.register_dim + self.intent_dim) * self.vocab_size,  # bias layer
            'tool_cortex': self.max_tools * self.hidden_dim * 3,  # embeddings + cross-attn
        }

        # Sum before the derived entries are added so they are not counted.
        total = sum(counts.values())
        counts['total'] = total
        counts['total_MB_f32'] = total * 4 / 1024 / 1024
        counts['total_MB_f16'] = total * 2 / 1024 / 1024
        return counts


# === Presets ===

def tiny_config() -> KernelConfig:
    """Build the smallest preset, intended for tests.

    Nominally ~1M params; the rough ``KernelConfig.estimated_params`` tally
    for these values comes to about 0.29M, so treat the figure as nominal.
    Fields not listed here keep their ``KernelConfig`` defaults.
    """
    overrides = dict(
        vocab_size=1000,
        hidden_dim=64,
        num_layers=2,
        num_heads=4,
        head_dim=16,          # 4 heads * 16 = hidden_dim 64
        ffn_dim=256,
        max_seq_len=128,
        hot_memory_slots=32,
        warm_rank=4,
        max_tools=8,
        num_specialists=2,
        cold_embedding_dim=64,  # must track hidden_dim
    )
    return KernelConfig(**overrides)

def dev_config() -> KernelConfig:
    """Build the mid-sized preset for fast development iteration on CPU.

    Nominally ~10M params; the rough ``KernelConfig.estimated_params`` tally
    for these values comes to about 3.2M, so treat the figure as nominal.
    Fields not listed here keep their ``KernelConfig`` defaults.
    """
    overrides = dict(
        vocab_size=8000,
        hidden_dim=128,
        num_layers=4,
        num_heads=4,
        head_dim=32,           # 4 heads * 32 = hidden_dim 128
        ffn_dim=512,
        max_seq_len=256,
        hot_memory_slots=128,
        warm_rank=8,
        max_tools=16,
        num_specialists=2,
        cold_embedding_dim=128,  # must track hidden_dim
    )
    return KernelConfig(**overrides)

def full_config() -> KernelConfig:
    """Build the target configuration.

    Nominally ~30M params; the rough ``KernelConfig.estimated_params`` tally
    for the defaults comes to about 23.4M, so treat the figure as nominal.
    """
    # No overrides needed: the dataclass defaults are the full configuration.
    config = KernelConfig()
    return config