bbkdevops's picture
download
raw
13.7 kB
"""TinyMind Omega — Model Configuration"""
from dataclasses import dataclass
@dataclass
class OmegaConfig:
    """Hyperparameter container for the TinyMind Omega model family.

    Every field has a default, so ``OmegaConfig()`` is a valid configuration
    on its own; the preset factory functions in this module override a subset
    of fields. The class performs no validation of the values it is given.
    """

    # Vocabulary
    vocab_size: int = 65_536
    pad_token_id: int = 0
    bos_token_id: int = 2
    eos_token_id: int = 3

    # Architecture dimensions
    dim: int = 1024        # hidden dimension
    n_layers: int = 24     # total layers
    n_heads: int = 16      # attention heads
    head_dim: int = 64     # dim per head (dim / n_heads)
    ffn_mult: int = 4      # FFN hidden = dim * ffn_mult

    # Hybrid layer pattern: "A"=linear attention, "S"=SSM; the string repeats.
    # e.g. "SAAS" tiled gives S A A S S A A S ... (two SSM layers out of every
    # four). How the model builder consumes the pattern is defined outside
    # this class.
    layer_pattern: str = "SAAS"

    # TinyMind PureField core params. "P" layers use a bounded recurrent
    # memory plus exact local window instead of full KV storage.
    memory_slots: int = 4
    memory_ranks: int = 32
    local_window: int = 128
    timescale_count: int = 4
    contractive_eps: float = 1e-3
    low_rank: int = 8
    residual_alpha: float = 0.2

    # SSM (Mamba-style) params
    ssm_d_state: int = 16  # SSM state dimension
    ssm_d_conv: int = 4    # SSM conv kernel size
    ssm_expand: int = 2    # SSM inner dim = dim * expand

    # KAN FFN params
    kan_grid: int = 5      # spline grid points
    kan_order: int = 3     # spline polynomial order

    # Linear Attention params
    feature_dim: int = 64  # random feature dimension for kernel approx

    # PureLattice CNN stem. When enabled, token embeddings pass through a
    # compact multi-scale convolutional adapter before the main Omega stack.
    cnn_core_enabled: bool = False
    cnn_hidden_mult: int = 2
    cnn_kernel_sizes: tuple[int, ...] = (3, 5, 9)
    cnn_dilations: tuple[int, ...] = (1, 2, 4)
    cnn_residual_scale: float = 0.5

    # Self-assessment core. When enabled, hidden states pass through a compact
    # recursive assessor that estimates evidence/conflict/uncertainty and feeds
    # a bounded correction back into the residual stream.
    self_assessment_enabled: bool = False
    self_assessment_inner_mult: int = 2
    self_assessment_steps: int = 2
    self_assessment_residual_scale: float = 0.15
    self_assessment_frequency: int = 0

    # Positional encoding
    max_seq_len: int = 4096
    rope_theta: float = 10_000.0

    # Regularization
    dropout: float = 0.1
    attention_dropout: float = 0.0

    # Training
    tie_word_embeddings: bool = True

    # Omega++ runtime and quality controls
    architecture_mode: str = "omega"
    precision_mode: str = "bf16_quality"  # bf16_quality | int4_sparse_fast | int6_bridge_imma_fast | auto
    sparsity_mode: str = "dense"          # dense | int4_4x8_pairwise_sparse | int6_2x4_pairwise_sparse
    int4_group_size: int = 64
    sparse_recovery_steps: int = 2_000
    quality_gate_delta: float = 0.05
    verifier_passes: int = 2
    retrieval_top_k: int = 4

    # ReGenesis-KV exact hybrid memory controls
    regen_kv_enabled: bool = False
    archive_chunk_tokens: int = 8192
    regen_kv_rank: int = 8
    ledger_hash_mode: str = "sha256_merkle"
    max_persistent_tokens: int = 10_000_000

    @property
    def n_ssm_layers(self) -> int:
        """Count the "S" entries when ``layer_pattern`` is tiled over ``n_layers``.

        The pattern is repeated cyclically and cut off after exactly
        ``n_layers`` positions, so a partial final repetition only contributes
        the "S" characters it actually contains.

        Returns:
            The number of "S" positions among the first ``n_layers`` slots;
            0 when the pattern is empty or ``n_layers`` is not positive.
        """
        pattern = self.layer_pattern
        if not pattern or self.n_layers <= 0:
            return 0
        # Whole repetitions contribute a fixed count each; only the leftover
        # prefix of the pattern has to be inspected separately.
        full_repeats, leftover = divmod(self.n_layers, len(pattern))
        return full_repeats * pattern.count("S") + pattern[:leftover].count("S")

    @property
    def total_params_estimate(self) -> str:
        """Return a rough dense parameter count as a human-readable string.

        The estimate counts one embedding table plus, per layer, four
        ``dim x dim`` attention projections, two ``dim x (dim * ffn_mult)``
        FFN matrices and ``4 * dim`` norm parameters. Nothing else enters the
        formula (no SSM, KAN, CNN-stem or untied output-head terms).

        Returns:
            ``"<x.y>B"`` when the total exceeds 1e9, otherwise ``"<n>M"``.
        """
        emb = self.vocab_size * self.dim
        per_layer = (
            4 * self.dim * self.dim +                  # attention QKV+O
            2 * self.dim * self.dim * self.ffn_mult +  # FFN
            self.dim * 4                               # norms
        )
        total = emb + per_layer * self.n_layers
        if total > 1e9:
            return f"{total/1e9:.1f}B"
        return f"{total/1e6:.0f}M"
# Preset configs
def tiny_config() -> OmegaConfig:
    """Return the smallest standard preset (labelled ~120M params), for fast iteration."""
    overrides = {"dim": 512, "n_layers": 12, "n_heads": 8, "head_dim": 64}
    return OmegaConfig(**overrides)
def small_config() -> OmegaConfig:
    """Return the "small" preset (labelled ~350M params), aimed at good quality."""
    overrides = {"dim": 1024, "n_layers": 24, "n_heads": 16, "head_dim": 64}
    return OmegaConfig(**overrides)
def medium_config() -> OmegaConfig:
    """Return the "medium" preset (labelled ~1B params, the stated max for a 24GB 3090)."""
    backbone = {"dim": 2048, "n_layers": 24, "n_heads": 16, "head_dim": 128}
    # Only the SSM state size and KAN grid differ from the class defaults
    # among the mixer settings.
    mixers = {"ssm_d_state": 32, "kan_grid": 8}
    return OmegaConfig(**backbone, **mixers)
def four_b_config() -> OmegaConfig:
    """Return the ~4B-class dense-estimate preset for RTX 3090 experiments.

    Full dense training of a 4B model is not realistic on a single 24GB 3090.
    The preset is intended for checkpoint construction, sharded/offloaded
    training, adapter/BitSharp tuning, INT4 export, and PureField/ReGenesis
    long-memory experiments.
    """
    backbone = {
        "dim": 3072,
        "n_layers": 36,
        "n_heads": 24,
        "head_dim": 128,
        "ffn_mult": 4,
        "layer_pattern": "SAAS",
    }
    mixers = {
        "ssm_d_state": 48,
        "ssm_d_conv": 4,
        "ssm_expand": 2,
        "kan_grid": 8,
        "kan_order": 3,
        "feature_dim": 96,
    }
    context_and_regularization = {
        "max_seq_len": 8192,
        "rope_theta": 500_000.0,
        "dropout": 0.05,
    }
    return OmegaConfig(**backbone, **mixers, **context_and_regularization)
def twelve_b_config() -> OmegaConfig:
    """Return the ~12B-class dense-estimate preset for compressed/offloaded experiments.

    A single RTX 3090 cannot full-train this dense class with Adam states.
    The preset is for PureField/ReGenesis 12B-class architecture planning,
    adapter training, sharded/offloaded runs, INT4 2:4 sparse export, and
    evidence-led quality gates.
    """
    backbone = {
        "dim": 5120,
        "n_layers": 40,
        "n_heads": 40,
        "head_dim": 128,
        "ffn_mult": 4,
        "layer_pattern": "SAAS",
    }
    mixers = {
        "ssm_d_state": 64,
        "ssm_d_conv": 4,
        "ssm_expand": 2,
        "kan_grid": 10,
        "kan_order": 3,
        "feature_dim": 128,
    }
    context_and_regularization = {
        "max_seq_len": 8192,
        "rope_theta": 1_000_000.0,
        "dropout": 0.04,
    }
    return OmegaConfig(**backbone, **mixers, **context_and_regularization)
def omega_plus_config(size: str = "small") -> OmegaConfig:
    """Build an Omega++ staged config with logic/runtime controls set.

    Args:
        size: One of "tiny", "small", "medium", "4b", "12b".

    Returns:
        The base preset for ``size`` with the Omega++ control fields
        overwritten.

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    presets = {
        "tiny": tiny_config,
        "small": small_config,
        "medium": medium_config,
        "4b": four_b_config,
        "12b": twelve_b_config,
    }
    factory = presets.get(size)
    if factory is None:
        raise ValueError(f"unknown Omega++ size '{size}', expected one of {sorted(presets)}")

    cfg = factory()
    overrides = {
        "architecture_mode": "omega_plus",
        "precision_mode": "bf16_quality",
        "sparsity_mode": "dense",
        "layer_pattern": "SAAS",
        "int4_group_size": 64,
        # The tiny preset gets a shorter sparse-recovery schedule.
        "sparse_recovery_steps": 200 if size == "tiny" else 2_000,
        "quality_gate_delta": 0.05,
        "verifier_passes": 2,
        "retrieval_top_k": 4,
    }
    for field_name, value in overrides.items():
        setattr(cfg, field_name, value)
    return cfg
def large_config() -> OmegaConfig:
    """Return the "large" preset (labelled ~3B params).

    Described by the author as the maximum for a 3090 with int8 or gradient
    checkpointing. It uses a 4096-token context, a larger SSM state and a
    finer KAN grid.

    Train: requires gradient checkpointing + bf16 + grad_accum >= 16.
    Inference: ~6-8GB VRAM with int4.
    """
    backbone = {
        "dim": 2560,
        "n_layers": 32,
        "n_heads": 20,
        "head_dim": 128,
        "ffn_mult": 4,
        "layer_pattern": "SAAS",
    }
    mixers = {
        "ssm_d_state": 48,
        "ssm_d_conv": 4,
        "ssm_expand": 2,
        "kan_grid": 10,
        "kan_order": 3,
        "feature_dim": 80,
    }
    context_and_regularization = {
        "max_seq_len": 4096,
        "rope_theta": 500_000.0,  # extended RoPE for long context
        "dropout": 0.05,
    }
    return OmegaConfig(**backbone, **mixers, **context_and_regularization)
def spectral_config(size: str = "nano") -> OmegaConfig:
    """Build a SpectralMind config, meant to be used with SpectralMindModel
    in spectral_compact.py.

    Author-stated sizes after BloomEmbedding + StiefelLinear + LowRankFFN:
        nano  : ~2-3M params   (attn_rank=16, ffn_rank=16, bloom_buckets=4096)
        micro : ~6-8M params   (attn_rank=32, ffn_rank=32, bloom_buckets=8192)
        small : ~15-20M params (attn_rank=48, ffn_rank=48, bloom_buckets=16384)

    Compared with the standard presets (author's figures):
        tiny config  = 120M -> SpectralMind nano  = 2-3M (50x smaller)
        small config = 350M -> SpectralMind micro = 6-8M (50x smaller)

    Training notes (from the author):
        - use SpectralTrainer in train/spectral_trainer.py
        - warmstart=True (closed-form init)
        - rank grows automatically during training

    The spectral-only values have no dedicated fields; they are stored in
    existing ones: ``memory_slots`` holds bloom_hashes, ``memory_ranks``
    holds bloom_buckets // 256, ``low_rank`` holds attn_rank and
    ``ssm_d_state`` holds ffn_rank.

    Args:
        size: One of "nano", "micro", "small".

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    presets = {
        "nano": {
            "dim": 256, "n_layers": 6, "n_heads": 4, "head_dim": 64,
            "ffn_mult": 4, "max_seq_len": 512,
            "attn_rank": 16, "ffn_rank": 16,
            "bloom_buckets": 4096, "bloom_hashes": 4,
        },
        "micro": {
            "dim": 384, "n_layers": 10, "n_heads": 6, "head_dim": 64,
            "ffn_mult": 4, "max_seq_len": 1024,
            "attn_rank": 32, "ffn_rank": 32,
            "bloom_buckets": 8192, "bloom_hashes": 4,
        },
        "small": {
            "dim": 512, "n_layers": 14, "n_heads": 8, "head_dim": 64,
            "ffn_mult": 4, "max_seq_len": 2048,
            "attn_rank": 48, "ffn_rank": 48,
            "bloom_buckets": 16384, "bloom_hashes": 4,
        },
    }
    spec = presets.get(size)
    if spec is None:
        raise ValueError(f"unknown spectral size '{size}', expected one of {sorted(presets)}")

    # Fields that map one-to-one onto OmegaConfig are copied by name.
    direct_fields = ("dim", "n_layers", "n_heads", "head_dim", "ffn_mult", "max_seq_len")
    direct = {field_name: spec[field_name] for field_name in direct_fields}

    return OmegaConfig(
        **direct,
        dropout=0.05,
        tie_word_embeddings=False,  # BloomEmbedding is not tied to lm_head
        architecture_mode="spectral",
        precision_mode="bf16_quality",
        # Spectral params are stored in existing dataclass fields (reuse).
        memory_slots=spec["bloom_hashes"],           # bloom_hashes
        memory_ranks=spec["bloom_buckets"] // 256,   # bloom_buckets / 256 (normalized)
        low_rank=spec["attn_rank"],                  # attn_rank
        ssm_d_state=spec["ffn_rank"],                # ffn_rank (reuse field)
    )
def spectral_hyperparams(cfg: OmegaConfig) -> dict:
    """Return the spectral-specific hyperparameters read from an OmegaConfig.

    Reads ``low_rank``, ``ssm_d_state``, ``memory_ranks`` and ``memory_slots``
    and reports them under their spectral names; ``memory_ranks`` is
    multiplied by 256 to give the bucket count.

    Args:
        cfg: Config whose reused fields hold the spectral values.

    Returns:
        Dict with keys "attn_rank", "ffn_rank", "bloom_buckets", "bloom_hashes".
    """
    attn_rank = cfg.low_rank
    ffn_rank = cfg.ssm_d_state
    bucket_count = cfg.memory_ranks * 256
    hash_count = cfg.memory_slots
    return dict(
        attn_rank=attn_rank,
        ffn_rank=ffn_rank,
        bloom_buckets=bucket_count,
        bloom_hashes=hash_count,
    )
def purefield_config(size: str = "small") -> OmegaConfig:
    """Build a TinyMind PureField config with the bounded-memory core enabled.

    Starts from the base preset for ``size``, switches the layer pattern to
    "P", applies size-dependent memory settings, and turns on the
    ReGenesis-KV controls.

    Args:
        size: One of "tiny", "small", "medium", "4b", "12b".

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    presets = {
        "tiny": tiny_config,
        "small": small_config,
        "medium": medium_config,
        "4b": four_b_config,
        "12b": twelve_b_config,
    }
    if size not in presets:
        raise ValueError(f"unknown PureField size '{size}', expected one of {sorted(presets)}")

    # Per-size values, in column order:
    # memory_slots, memory_ranks, local_window, timescale_count, low_rank,
    # sparse_recovery_steps, precision_mode, sparsity_mode
    columns = (
        "memory_slots", "memory_ranks", "local_window", "timescale_count",
        "low_rank", "sparse_recovery_steps", "precision_mode", "sparsity_mode",
    )
    size_table = {
        "tiny":   (4, 16, 128, 4, 8, 200, "bf16_quality", "dense"),
        "small":  (6, 32, 256, 6, 16, 2_000, "bf16_quality", "dense"),
        "medium": (8, 64, 512, 8, 32, 4_000, "bf16_quality", "dense"),
        "4b":     (12, 128, 1024, 12, 64, 8_000, "auto", "int4_4x8_pairwise_sparse"),
        "12b":    (16, 192, 2048, 16, 96, 16_000,
                   "int6_bridge_imma_fast", "int6_2x4_pairwise_sparse"),
    }

    cfg = presets[size]()

    # Settings shared by every size.
    cfg.architecture_mode = "purefield"
    cfg.layer_pattern = "P"
    cfg.int4_group_size = 64
    cfg.quality_gate_delta = 0.05

    # Size-dependent memory, recovery-schedule and precision settings.
    for field_name, value in zip(columns, size_table[size]):
        setattr(cfg, field_name, value)

    cfg.contractive_eps = 1e-3
    # 1/sqrt(n_layers), capped at 1.0.
    cfg.residual_alpha = min(1.0, cfg.n_layers ** -0.5)

    # ReGenesis-KV controls.
    cfg.regen_kv_enabled = True
    cfg.archive_chunk_tokens = 8192
    cfg.retrieval_top_k = 8
    cfg.regen_kv_rank = max(4, cfg.low_rank)
    cfg.ledger_hash_mode = "sha256_merkle"
    cfg.max_persistent_tokens = 10_000_000
    return cfg
def axiomweave_config(size: str = "tiny") -> OmegaConfig:
    """Build an AxiomWeave routed-synthesis config.

    The preset is designed for experiments that combine attention, SSM,
    PureField memory, KAN nonlinearity, ReGenesis-ready exact memory, and INT4
    sparse export readiness in one model family.

    It takes the PureField config for ``size`` and then changes the layer
    pattern to "W", disables ``regen_kv_enabled``, and raises the verifier
    pass and retrieval top-k settings.

    Args:
        size: One of "tiny", "small", "medium", "4b", "12b".

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    presets = {
        "tiny": tiny_config,
        "small": small_config,
        "medium": medium_config,
        "4b": four_b_config,
        "12b": twelve_b_config,
    }
    if size not in presets:
        raise ValueError(f"unknown AxiomWeave size '{size}', expected one of {sorted(presets)}")

    # Only the two largest sizes leave the dense bf16 defaults.
    precision_by_size = {"12b": "int6_bridge_imma_fast", "4b": "auto"}
    sparsity_by_size = {
        "12b": "int6_2x4_pairwise_sparse",
        "4b": "int4_4x8_pairwise_sparse",
    }

    cfg = purefield_config(size)
    cfg.architecture_mode = "axiomweave"
    cfg.layer_pattern = "W"
    cfg.regen_kv_enabled = False
    cfg.verifier_passes = 3
    cfg.retrieval_top_k = 12
    cfg.ledger_hash_mode = "sha256_merkle"
    cfg.precision_mode = precision_by_size.get(size, "bf16_quality")
    cfg.sparsity_mode = sparsity_by_size.get(size, "dense")
    return cfg

Xet Storage Details

Size:
13.7 kB
·
Xet hash:
6c1bdf06ea0cb9a5bc43db2261c696d9107d2831ab01e7a7cf7e63cad9bc11fb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.