Buckets:
| """TinyMind Omega — Model Configuration""" | |
| from dataclasses import dataclass | |
@dataclass
class OmegaConfig:
    """Hyperparameter container shared by every TinyMind Omega model variant.

    All fields have defaults, so ``OmegaConfig()`` is valid and the preset
    factories in this module only override what differs. The ``@dataclass``
    decorator is required: the presets construct the config with keyword
    arguments, which relies on the generated ``__init__``.
    """

    # --- Vocabulary ---
    vocab_size: int = 65_536
    pad_token_id: int = 0
    bos_token_id: int = 2
    eos_token_id: int = 3

    # --- Architecture dimensions ---
    dim: int = 1024  # hidden dimension
    n_layers: int = 24  # total layers
    n_heads: int = 16  # attention heads
    head_dim: int = 64  # dim per head (dim / n_heads)
    ffn_mult: int = 4  # FFN hidden = dim * ffn_mult

    # Hybrid layer pattern, repeated over the stack:
    # "A" = linear attention, "S" = SSM.
    # The default "SAAS" contains two "S" entries, i.e. 2 SSM layers per
    # 4 layers. The purefield/axiomweave presets in this module also use the
    # single-letter patterns "P" and "W".
    layer_pattern: str = "SAAS"

    # --- TinyMind PureField core params ---
    # "P" layers use a bounded recurrent memory plus an exact local window
    # instead of full KV storage.
    memory_slots: int = 4
    memory_ranks: int = 32
    local_window: int = 128
    timescale_count: int = 4
    contractive_eps: float = 1e-3
    low_rank: int = 8
    residual_alpha: float = 0.2

    # --- SSM (Mamba-style) params ---
    ssm_d_state: int = 16  # SSM state dimension
    ssm_d_conv: int = 4  # SSM conv kernel size
    ssm_expand: int = 2  # SSM inner dim = dim * expand

    # --- KAN FFN params ---
    kan_grid: int = 5  # spline grid points
    kan_order: int = 3  # spline polynomial order

    # --- Linear attention params ---
    feature_dim: int = 64  # random feature dimension for kernel approx

    # --- PureLattice CNN stem ---
    # When enabled, token embeddings pass through a compact multi-scale
    # convolutional adapter before the main Omega stack.
    cnn_core_enabled: bool = False
    cnn_hidden_mult: int = 2
    cnn_kernel_sizes: tuple[int, ...] = (3, 5, 9)
    cnn_dilations: tuple[int, ...] = (1, 2, 4)
    cnn_residual_scale: float = 0.5

    # --- Self-assessment core ---
    # When enabled, hidden states pass through a compact recursive assessor
    # that estimates evidence/conflict/uncertainty and feeds a bounded
    # correction back into the residual stream.
    self_assessment_enabled: bool = False
    self_assessment_inner_mult: int = 2
    self_assessment_steps: int = 2
    self_assessment_residual_scale: float = 0.15
    self_assessment_frequency: int = 0

    # --- Positional encoding ---
    max_seq_len: int = 4096
    rope_theta: float = 10_000.0

    # --- Regularization ---
    dropout: float = 0.1
    attention_dropout: float = 0.0

    # --- Training ---
    tie_word_embeddings: bool = True

    # --- Omega++ runtime and quality controls ---
    architecture_mode: str = "omega"
    # bf16_quality | int4_sparse_fast | int6_bridge_imma_fast | auto
    precision_mode: str = "bf16_quality"
    # dense | int4_4x8_pairwise_sparse | int6_2x4_pairwise_sparse
    sparsity_mode: str = "dense"
    int4_group_size: int = 64
    sparse_recovery_steps: int = 2_000
    quality_gate_delta: float = 0.05
    verifier_passes: int = 2
    retrieval_top_k: int = 4

    # --- ReGenesis-KV exact hybrid memory controls ---
    regen_kv_enabled: bool = False
    archive_chunk_tokens: int = 8192
    regen_kv_rank: int = 8
    ledger_hash_mode: str = "sha256_merkle"
    max_persistent_tokens: int = 10_000_000

    # NOTE(review): if callers read this as an attribute (``cfg.n_ssm_layers``)
    # it needs ``@property``; no caller is visible here, so it stays a method.
    def n_ssm_layers(self) -> int:
        """Return how many of the ``n_layers`` layers are SSM ("S") layers.

        Counts "S" over exactly the first ``n_layers`` entries of the repeated
        pattern. Assumes layer ``i`` takes type
        ``layer_pattern[i % len(layer_pattern)]`` — TODO confirm against the
        model builder. Returns 0 for an empty pattern or a non-positive layer
        count instead of dividing by zero.
        """
        pattern = self.layer_pattern
        if not pattern or self.n_layers <= 0:
            return 0
        # Whole repetitions of the pattern plus the partial tail; counting over
        # one extra full repetition would over-report the SSM layer count.
        full_cycles, tail_len = divmod(self.n_layers, len(pattern))
        return full_cycles * pattern.count("S") + pattern[:tail_len].count("S")

    def total_params_estimate(self) -> str:
        """Return a rough dense parameter count as a short string.

        The estimate covers one embedding table, and per layer the attention
        projections, the FFN and the norms. It ignores the layer pattern and
        every SSM/KAN/CNN/memory-specific field.

        Returns:
            ``"<x.y>B"`` when the total exceeds 1e9, otherwise ``"<n>M"``.
        """
        emb = self.vocab_size * self.dim
        per_layer = (
            4 * self.dim * self.dim  # attention QKV+O
            + 2 * self.dim * self.dim * self.ffn_mult  # FFN
            + self.dim * 4  # norms
        )
        total = emb + per_layer * self.n_layers
        if total > 1e9:
            return f"{total/1e9:.1f}B"
        return f"{total/1e6:.0f}M"
| # Preset configs | |
def tiny_config() -> OmegaConfig:
    """Build the smallest dense preset (labelled ~120M params) for fast iteration."""
    overrides = {"dim": 512, "n_layers": 12, "n_heads": 8, "head_dim": 64}
    return OmegaConfig(**overrides)
def small_config() -> OmegaConfig:
    """Build the "small" dense preset (labelled ~350M params, good quality)."""
    overrides = {"dim": 1024, "n_layers": 24, "n_heads": 16, "head_dim": 64}
    return OmegaConfig(**overrides)
def medium_config() -> OmegaConfig:
    """Build the "medium" dense preset (labelled ~1B params, max for a 24GB 3090)."""
    shape = {"dim": 2048, "n_layers": 24, "n_heads": 16, "head_dim": 128}
    # Larger SSM state and KAN grid than the class defaults.
    mixers = {"ssm_d_state": 32, "kan_grid": 8}
    return OmegaConfig(**shape, **mixers)
def four_b_config() -> OmegaConfig:
    """Build the ~4B-class dense preset for RTX 3090 experiments.

    Fully training a dense 4B model is not realistic on a single 24GB 3090.
    The preset targets checkpoint construction, sharded/offloaded training,
    adapter/BitSharp tuning, INT4 export, and PureField/ReGenesis long-memory
    experiments.
    """
    shape = {
        "dim": 3072,
        "n_layers": 36,
        "n_heads": 24,
        "head_dim": 128,
        "ffn_mult": 4,
        "layer_pattern": "SAAS",
    }
    mixers = {
        "ssm_d_state": 48,
        "ssm_d_conv": 4,
        "ssm_expand": 2,
        "kan_grid": 8,
        "kan_order": 3,
        "feature_dim": 96,
    }
    context = {
        "max_seq_len": 8192,
        "rope_theta": 500_000.0,
        "dropout": 0.05,
    }
    return OmegaConfig(**shape, **mixers, **context)
def twelve_b_config() -> OmegaConfig:
    """Build the ~12B-class dense preset for compressed/offloaded experiments.

    A single RTX 3090 cannot fully train this dense class with Adam states.
    The preset targets PureField/ReGenesis 12B-class architecture planning,
    adapter training, sharded/offloaded runs, INT4 2:4 sparse export, and
    evidence-led quality gates.
    """
    shape = {
        "dim": 5120,
        "n_layers": 40,
        "n_heads": 40,
        "head_dim": 128,
        "ffn_mult": 4,
        "layer_pattern": "SAAS",
    }
    mixers = {
        "ssm_d_state": 64,
        "ssm_d_conv": 4,
        "ssm_expand": 2,
        "kan_grid": 10,
        "kan_order": 3,
        "feature_dim": 128,
    }
    context = {
        "max_seq_len": 8192,
        "rope_theta": 1_000_000.0,
        "dropout": 0.04,
    }
    return OmegaConfig(**shape, **mixers, **context)
def omega_plus_config(size: str = "small") -> OmegaConfig:
    """Return a base preset with the Omega++ logic/runtime controls applied.

    Args:
        size: One of "tiny", "small", "medium", "4b", "12b".

    Returns:
        The chosen base config, mutated so ``architecture_mode`` is
        "omega_plus" and the runtime/quality fields hold the Omega++ values.

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    presets = {
        "tiny": tiny_config,
        "small": small_config,
        "medium": medium_config,
        "4b": four_b_config,
        "12b": twelve_b_config,
    }
    factory = presets.get(size)
    if factory is None:
        raise ValueError(f"unknown Omega++ size '{size}', expected one of {sorted(presets)}")

    cfg = factory()
    controls = {
        "architecture_mode": "omega_plus",
        "precision_mode": "bf16_quality",
        "sparsity_mode": "dense",
        "layer_pattern": "SAAS",
        "int4_group_size": 64,
        # The tiny preset gets a much shorter sparse-recovery schedule.
        "sparse_recovery_steps": 200 if size == "tiny" else 2_000,
        "quality_gate_delta": 0.05,
        "verifier_passes": 2,
        "retrieval_top_k": 4,
    }
    for field_name, value in controls.items():
        setattr(cfg, field_name, value)
    return cfg
def large_config() -> OmegaConfig:
    """Build the ~3B preset: max intelligence on a 3090 with int8 or gradient checkpointing.

    Uses a 4096-token context, a larger SSM state and a finer KAN grid.
    Training: requires gradient checkpointing + bf16 + grad_accum >= 16.
    Inference: ~6-8GB VRAM with int4.
    """
    shape = {
        "dim": 2560,
        "n_layers": 32,
        "n_heads": 20,
        "head_dim": 128,
        "ffn_mult": 4,
        "layer_pattern": "SAAS",
    }
    mixers = {
        "ssm_d_state": 48,
        "ssm_d_conv": 4,
        "ssm_expand": 2,
        "kan_grid": 10,
        "kan_order": 3,
        "feature_dim": 80,
    }
    context = {
        "max_seq_len": 4096,
        "rope_theta": 500_000.0,  # extended RoPE for long context
        "dropout": 0.05,
    }
    return OmegaConfig(**shape, **mixers, **context)
def spectral_config(size: str = "nano") -> OmegaConfig:
    """Build a SpectralMind config, used together with SpectralMindModel in spectral_compact.py.

    Actual size after BloomEmbedding + StiefelLinear + LowRankFFN:
        nano  : ~2-3M params   (attn_rank=16, ffn_rank=16, bloom_buckets=4096)
        micro : ~6-8M params   (attn_rank=32, ffn_rank=32, bloom_buckets=8192)
        small : ~15-20M params (attn_rank=48, ffn_rank=48, bloom_buckets=16384)

    Compared with the standard configs:
        tiny config  = 120M -> SpectralMind nano  = 2-3M (50x smaller)
        small config = 350M -> SpectralMind micro = 6-8M (50x smaller)

    Training:
        - use SpectralTrainer in train/spectral_trainer.py
        - warmstart=True (closed-form init)
        - rank grows automatically during training

    Spectral-only values are packed into existing fields and can be read back
    with ``spectral_hyperparams``: ``bloom_hashes`` -> ``memory_slots``,
    ``bloom_buckets // 256`` -> ``memory_ranks``, ``attn_rank`` -> ``low_rank``,
    ``ffn_rank`` -> ``ssm_d_state``.

    Raises:
        ValueError: If ``size`` is not "nano", "micro" or "small".
    """
    presets = {
        "nano": dict(
            dim=256, n_layers=6, n_heads=4, head_dim=64,
            ffn_mult=4, max_seq_len=512,
            attn_rank=16, ffn_rank=16,
            bloom_buckets=4096, bloom_hashes=4,
        ),
        "micro": dict(
            dim=384, n_layers=10, n_heads=6, head_dim=64,
            ffn_mult=4, max_seq_len=1024,
            attn_rank=32, ffn_rank=32,
            bloom_buckets=8192, bloom_hashes=4,
        ),
        "small": dict(
            dim=512, n_layers=14, n_heads=8, head_dim=64,
            ffn_mult=4, max_seq_len=2048,
            attn_rank=48, ffn_rank=48,
            bloom_buckets=16384, bloom_hashes=4,
        ),
    }
    if size not in presets:
        raise ValueError(f"unknown spectral size '{size}', expected one of {sorted(presets)}")
    chosen = presets[size]

    core_fields = ("dim", "n_layers", "n_heads", "head_dim", "ffn_mult", "max_seq_len")
    cfg = OmegaConfig(
        **{name: chosen[name] for name in core_fields},
        dropout=0.05,
        tie_word_embeddings=False,  # BloomEmbedding is not tied to lm_head
        architecture_mode="spectral",
        precision_mode="bf16_quality",
    )

    # Store the spectral params in the memory_slots/ranks fields
    # (reusing the existing dataclass rather than adding new fields).
    cfg.memory_slots = chosen["bloom_hashes"]
    cfg.memory_ranks = chosen["bloom_buckets"] // 256  # normalized by 256
    cfg.low_rank = chosen["attn_rank"]
    cfg.ssm_d_state = chosen["ffn_rank"]
    return cfg
def spectral_hyperparams(cfg: OmegaConfig) -> dict:
    """Return the spectral-specific hyperparameters packed into an OmegaConfig.

    Reads the reused fields back out: ``low_rank`` as ``attn_rank``,
    ``ssm_d_state`` as ``ffn_rank``, ``memory_ranks * 256`` as
    ``bloom_buckets`` and ``memory_slots`` as ``bloom_hashes``.
    """
    params = {}
    params["attn_rank"] = cfg.low_rank
    params["ffn_rank"] = cfg.ssm_d_state
    # memory_ranks holds the bucket count divided by 256; scale it back up.
    params["bloom_buckets"] = cfg.memory_ranks * 256
    params["bloom_hashes"] = cfg.memory_slots
    return params
def purefield_config(size: str = "small") -> OmegaConfig:
    """Return a base preset reconfigured for the PureField bounded-memory core.

    Every layer becomes a "P" layer, the memory fields are scaled per size, and
    the ReGenesis-KV controls are switched on. The "4b" and "12b" sizes also
    move to quantized/sparse precision modes; the rest stay bf16/dense.

    Args:
        size: One of "tiny", "small", "medium", "4b", "12b".

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    presets = {
        "tiny": tiny_config,
        "small": small_config,
        "medium": medium_config,
        "4b": four_b_config,
        "12b": twelve_b_config,
    }
    if size not in presets:
        raise ValueError(f"unknown PureField size '{size}', expected one of {sorted(presets)}")

    # Column order of each row in memory_table.
    memory_fields = (
        "memory_slots",
        "memory_ranks",
        "local_window",
        "timescale_count",
        "low_rank",
        "sparse_recovery_steps",
    )
    memory_table = {
        "tiny": (4, 16, 128, 4, 8, 200),
        "small": (6, 32, 256, 6, 16, 2_000),
        "medium": (8, 64, 512, 8, 32, 4_000),
        "4b": (12, 128, 1024, 12, 64, 8_000),
        "12b": (16, 192, 2048, 16, 96, 16_000),
    }
    # (precision_mode, sparsity_mode) for the sizes that leave bf16/dense.
    quantized_modes = {
        "4b": ("auto", "int4_4x8_pairwise_sparse"),
        "12b": ("int6_bridge_imma_fast", "int6_2x4_pairwise_sparse"),
    }

    cfg = presets[size]()
    cfg.architecture_mode = "purefield"
    cfg.layer_pattern = "P"
    cfg.int4_group_size = 64
    cfg.quality_gate_delta = 0.05

    for field_name, value in zip(memory_fields, memory_table[size]):
        setattr(cfg, field_name, value)

    precision, sparsity = quantized_modes.get(size, ("bf16_quality", "dense"))
    cfg.precision_mode = precision
    cfg.sparsity_mode = sparsity

    cfg.contractive_eps = 1e-3
    # Residual scale is 1/sqrt(n_layers), capped at 1.0.
    cfg.residual_alpha = min(1.0, cfg.n_layers ** -0.5)

    # ReGenesis-KV controls; the rank is low_rank with a floor of 4.
    cfg.regen_kv_enabled = True
    cfg.archive_chunk_tokens = 8192
    cfg.retrieval_top_k = 8
    cfg.regen_kv_rank = max(4, cfg.low_rank)
    cfg.ledger_hash_mode = "sha256_merkle"
    cfg.max_persistent_tokens = 10_000_000
    return cfg
def axiomweave_config(size: str = "tiny") -> OmegaConfig:
    """Return an AxiomWeave routed-synthesis config.

    The preset is meant for experiments that combine attention, SSM, PureField
    memory, KAN nonlinearity, ReGenesis-ready exact memory, and INT4 sparse
    export readiness in one model family.

    It starts from ``purefield_config(size)``, switches every layer to "W",
    turns ``regen_kv_enabled`` off, and raises the verifier/retrieval settings.

    Args:
        size: One of "tiny", "small", "medium", "4b", "12b".

    Raises:
        ValueError: If ``size`` is not a known preset name.
    """
    valid_sizes = ("tiny", "small", "medium", "4b", "12b")
    if size not in valid_sizes:
        raise ValueError(f"unknown AxiomWeave size '{size}', expected one of {sorted(valid_sizes)}")

    # Only the two largest sizes leave the bf16/dense defaults.
    precision_by_size = {"12b": "int6_bridge_imma_fast", "4b": "auto"}
    sparsity_by_size = {
        "12b": "int6_2x4_pairwise_sparse",
        "4b": "int4_4x8_pairwise_sparse",
    }

    cfg = purefield_config(size)
    cfg.architecture_mode = "axiomweave"
    cfg.layer_pattern = "W"
    cfg.regen_kv_enabled = False
    cfg.verifier_passes = 3
    cfg.retrieval_top_k = 12
    cfg.ledger_hash_mode = "sha256_merkle"
    cfg.precision_mode = precision_by_size.get(size, "bf16_quality")
    cfg.sparsity_mode = sparsity_by_size.get(size, "dense")
    return cfg
Xet Storage Details
- Size:
- 13.7 kB
- Xet hash:
- 6c1bdf06ea0cb9a5bc43db2261c696d9107d2831ab01e7a7cf7e63cad9bc11fb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.