Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /model /config.py

bbkdevops

about 1 month ago

download

raw

13.7 kB

	"""TinyMind Omega — Model Configuration"""
	from dataclasses import dataclass


	@dataclass
	class OmegaConfig:
	    """Hyperparameter bundle for the TinyMind Omega model family.

	    Every field carries a default, so ``OmegaConfig()`` is a complete
	    configuration; the preset factories in this module override a subset
	    of the fields. The dataclass is mutable and performs no validation.
	    """

	    # Vocabulary
	    vocab_size: int = 65_536
	    pad_token_id: int = 0
	    bos_token_id: int = 2
	    eos_token_id: int = 3

	    # Architecture dimensions
	    dim: int = 1024  # hidden dimension
	    n_layers: int = 24  # total layers
	    n_heads: int = 16  # attention heads
	    head_dim: int = 64  # dim per head (dim / n_heads); stored, not derived
	    ffn_mult: int = 4  # FFN hidden = dim * ffn_mult

	    # Hybrid layer pattern, one letter per layer, repeated to fill n_layers:
	    # "A" = linear attention, "S" = SSM.
	    # e.g. "SAAS" tiled as SAAS SAAS ... -> 2 SSM + 2 attention layers per 4.
	    layer_pattern: str = "SAAS"

	    # TinyMind PureField core params. "P" layers use a bounded recurrent
	    # memory plus exact local window instead of full KV storage.
	    memory_slots: int = 4
	    memory_ranks: int = 32
	    local_window: int = 128
	    timescale_count: int = 4
	    contractive_eps: float = 1e-3
	    low_rank: int = 8
	    residual_alpha: float = 0.2

	    # SSM (Mamba-style) params
	    ssm_d_state: int = 16  # SSM state dimension
	    ssm_d_conv: int = 4  # SSM conv kernel size
	    ssm_expand: int = 2  # SSM inner dim = dim * expand

	    # KAN FFN params
	    kan_grid: int = 5  # spline grid points
	    kan_order: int = 3  # spline polynomial order

	    # Linear Attention params
	    feature_dim: int = 64  # random feature dimension for kernel approx

	    # PureLattice CNN stem. When enabled, token embeddings pass through a
	    # compact multi-scale convolutional adapter before the main Omega stack.
	    cnn_core_enabled: bool = False
	    cnn_hidden_mult: int = 2
	    cnn_kernel_sizes: tuple[int, ...] = (3, 5, 9)
	    cnn_dilations: tuple[int, ...] = (1, 2, 4)
	    cnn_residual_scale: float = 0.5

	    # Self-assessment core. When enabled, hidden states pass through a compact
	    # recursive assessor that estimates evidence/conflict/uncertainty and feeds
	    # a bounded correction back into the residual stream.
	    self_assessment_enabled: bool = False
	    self_assessment_inner_mult: int = 2
	    self_assessment_steps: int = 2
	    self_assessment_residual_scale: float = 0.15
	    self_assessment_frequency: int = 0

	    # Positional encoding
	    max_seq_len: int = 4096
	    rope_theta: float = 10_000.0

	    # Regularization
	    dropout: float = 0.1
	    attention_dropout: float = 0.0

	    # Training
	    tie_word_embeddings: bool = True

	    # Omega++ runtime and quality controls
	    architecture_mode: str = "omega"
	    # bf16_quality | int4_sparse_fast | int6_bridge_imma_fast | auto
	    precision_mode: str = "bf16_quality"
	    # dense | int4_4x8_pairwise_sparse | int6_2x4_pairwise_sparse
	    sparsity_mode: str = "dense"
	    int4_group_size: int = 64
	    sparse_recovery_steps: int = 2_000
	    quality_gate_delta: float = 0.05
	    verifier_passes: int = 2
	    retrieval_top_k: int = 4

	    # ReGenesis-KV exact hybrid memory controls
	    regen_kv_enabled: bool = False
	    archive_chunk_tokens: int = 8192
	    regen_kv_rank: int = 8
	    ledger_hash_mode: str = "sha256_merkle"
	    max_persistent_tokens: int = 10_000_000

	    @property
	    def n_ssm_layers(self) -> int:
	        """Count the "S" (SSM) layers among the first ``n_layers`` layers.

	        The pattern is treated as tiling cyclically over the layer stack
	        (NOTE(review): confirm the model builder tiles it the same way).
	        Only whole repeats plus the leading remainder are counted, so the
	        result never exceeds ``n_layers``.

	        Returns:
	            Number of SSM layers; 0 when the pattern is empty or
	            ``n_layers`` is not positive.
	        """
	        pattern = self.layer_pattern
	        # An empty pattern would otherwise divide by zero.
	        if not pattern or self.n_layers <= 0:
	            return 0
	        whole_repeats, leftover = divmod(self.n_layers, len(pattern))
	        return whole_repeats * pattern.count("S") + pattern[:leftover].count("S")

	    @property
	    def total_params_estimate(self) -> str:
	        """Return a rough dense parameter count as a short string.

	        The estimate sums the token embedding and, per layer, four
	        ``dim x dim`` attention matrices, two ``dim x (dim * ffn_mult)``
	        FFN matrices and ``4 * dim`` norm weights. No other field of the
	        config enters the formula.

	        Returns:
	            ``"<x.y>B"`` when the total exceeds 1e9, otherwise ``"<n>M"``.
	        """
	        dim_sq = self.dim * self.dim
	        embedding = self.vocab_size * self.dim
	        per_layer = (
	            4 * dim_sq  # attention QKV+O
	            + 2 * dim_sq * self.ffn_mult  # FFN
	            + 4 * self.dim  # norms
	        )
	        total = embedding + per_layer * self.n_layers
	        if total > 1e9:
	            return f"{total / 1e9:.1f}B"
	        return f"{total / 1e6:.0f}M"


	# Preset configs
	def tiny_config() -> OmegaConfig:
	    """Build the smallest standard preset, meant for fast iteration.

	    Labelled ~120M params by its author. All fields other than the four
	    set here keep their ``OmegaConfig`` defaults.
	    """
	    return OmegaConfig(
	        dim=512,
	        n_layers=12,
	        n_heads=8,
	        head_dim=64,
	    )

	def small_config() -> OmegaConfig:
	    """Build the "good quality" preset, labelled ~350M params by its author.

	    The four values passed here are spelled out explicitly; every other
	    field keeps its ``OmegaConfig`` default.
	    """
	    shape = {"dim": 1024, "n_layers": 24, "n_heads": 16, "head_dim": 64}
	    return OmegaConfig(**shape)

	def medium_config() -> OmegaConfig:
	    """Build the ~1B-param preset, described as the max for a 24GB RTX 3090.

	    Besides the backbone shape it sets a larger SSM state and KAN grid than
	    the ``OmegaConfig`` defaults.
	    """
	    backbone = dict(dim=2048, n_layers=24, n_heads=16, head_dim=128)
	    mixers = dict(ssm_d_state=32, kan_grid=8)
	    return OmegaConfig(**backbone, **mixers)


	def four_b_config() -> OmegaConfig:
	    """Build the ~4B-class dense preset for RTX 3090 experiments.

	    Per the author, full dense training at this size is not realistic on a
	    single 24GB 3090. The preset targets checkpoint construction,
	    sharded/offloaded training, adapter/BitSharp tuning, INT4 export, and
	    PureField/ReGenesis long-memory experiments.
	    """
	    backbone = dict(
	        dim=3072,
	        n_layers=36,
	        n_heads=24,
	        head_dim=128,
	        ffn_mult=4,
	        layer_pattern="SAAS",
	    )
	    mixers = dict(
	        ssm_d_state=48,
	        ssm_d_conv=4,
	        ssm_expand=2,
	        kan_grid=8,
	        kan_order=3,
	        feature_dim=96,
	    )
	    sequence_and_reg = dict(
	        max_seq_len=8192,
	        rope_theta=500_000.0,
	        dropout=0.05,
	    )
	    return OmegaConfig(**backbone, **mixers, **sequence_and_reg)


	def twelve_b_config() -> OmegaConfig:
	    """Build the ~12B-class dense preset for compressed/offloaded experiments.

	    Per the author, a single RTX 3090 cannot full-train this dense class
	    with Adam states. The preset is for PureField/ReGenesis 12B-class
	    architecture planning, adapter training, sharded/offloaded runs,
	    INT4 2:4 sparse export, and evidence-led quality gates.
	    """
	    settings = {
	        # Backbone shape
	        "dim": 5120,
	        "n_layers": 40,
	        "n_heads": 40,
	        "head_dim": 128,
	        "ffn_mult": 4,
	        "layer_pattern": "SAAS",
	        # SSM / KAN / linear-attention mixers
	        "ssm_d_state": 64,
	        "ssm_d_conv": 4,
	        "ssm_expand": 2,
	        "kan_grid": 10,
	        "kan_order": 3,
	        "feature_dim": 128,
	        # Sequence length, RoPE base and regularization
	        "max_seq_len": 8192,
	        "rope_theta": 1_000_000.0,
	        "dropout": 0.04,
	    }
	    return OmegaConfig(**settings)


	def omega_plus_config(size: str = "small") -> OmegaConfig:
	    """Return a base preset switched to the Omega++ architecture mode.

	    Args:
	        size: One of "tiny", "small", "medium", "4b", "12b".

	    Returns:
	        The preset for ``size`` with the Omega++ runtime and quality
	        controls written onto it.

	    Raises:
	        ValueError: If ``size`` is not a known preset name.
	    """
	    builders = {
	        "tiny": tiny_config,
	        "small": small_config,
	        "medium": medium_config,
	        "4b": four_b_config,
	        "12b": twelve_b_config,
	    }
	    if size not in builders:
	        raise ValueError(f"unknown Omega++ size '{size}', expected one of {sorted(builders)}")

	    cfg = builders[size]()
	    # The tiny preset gets a much shorter sparse-recovery schedule.
	    recovery_steps = 200 if size == "tiny" else 2_000
	    overrides = {
	        "architecture_mode": "omega_plus",
	        "precision_mode": "bf16_quality",
	        "sparsity_mode": "dense",
	        "layer_pattern": "SAAS",
	        "int4_group_size": 64,
	        "sparse_recovery_steps": recovery_steps,
	        "quality_gate_delta": 0.05,
	        "verifier_passes": 2,
	        "retrieval_top_k": 4,
	    }
	    for field_name, value in overrides.items():
	        setattr(cfg, field_name, value)
	    return cfg


	def large_config() -> OmegaConfig:
	    """Build the ~3B-param preset, the author's largest single-3090 target.

	    Described by the author as max intelligence on a 3090 with int8 or
	    gradient checkpointing. Uses a 4096-token context, a larger SSM state
	    and a finer KAN grid.

	    Training: requires gradient checkpointing + bf16 + grad_accum >= 16.
	    Inference: ~6-8GB VRAM with int4.
	    """
	    backbone = dict(
	        dim=2560,
	        n_layers=32,
	        n_heads=20,
	        head_dim=128,
	        ffn_mult=4,
	        layer_pattern="SAAS",
	    )
	    mixers = dict(
	        ssm_d_state=48,
	        ssm_d_conv=4,
	        ssm_expand=2,
	        kan_grid=10,
	        kan_order=3,
	        feature_dim=80,
	    )
	    sequence_and_reg = dict(
	        max_seq_len=4096,
	        rope_theta=500_000.0,  # extended RoPE for long context
	        dropout=0.05,
	    )
	    return OmegaConfig(**backbone, **mixers, **sequence_and_reg)


	def spectral_config(size: str = "nano") -> OmegaConfig:
	    """Build a SpectralMind config, for use with SpectralMindModel in
	    spectral_compact.py.

	    Actual sizes after BloomEmbedding + StiefelLinear + LowRankFFN, as
	    stated by the author:

	        nano  : ~2-3M params   (attn_rank=16, ffn_rank=16, bloom_buckets=4096)
	        micro : ~6-8M params   (attn_rank=32, ffn_rank=32, bloom_buckets=8192)
	        small : ~15-20M params (attn_rank=48, ffn_rank=48, bloom_buckets=16384)

	    Compared with the standard presets:
	        tiny config  = 120M -> SpectralMind nano  = 2-3M (50x smaller)
	        small config = 350M -> SpectralMind micro = 6-8M (50x smaller)

	    Training:
	        - use SpectralTrainer in train/spectral_trainer.py
	        - warmstart=True (closed-form init)
	        - rank grows automatically during training

	    The spectral-only values are stored in existing ``OmegaConfig`` fields:
	    ``memory_slots`` holds bloom_hashes, ``memory_ranks`` holds
	    bloom_buckets // 256, ``low_rank`` holds attn_rank and ``ssm_d_state``
	    holds ffn_rank.

	    Args:
	        size: One of "nano", "micro", "small".

	    Raises:
	        ValueError: If ``size`` is not a known spectral preset name.
	    """
	    presets = {
	        "nano": {
	            "dim": 256, "n_layers": 6, "n_heads": 4, "head_dim": 64,
	            "ffn_mult": 4, "max_seq_len": 512,
	            "attn_rank": 16, "ffn_rank": 16,
	            "bloom_buckets": 4096, "bloom_hashes": 4,
	        },
	        "micro": {
	            "dim": 384, "n_layers": 10, "n_heads": 6, "head_dim": 64,
	            "ffn_mult": 4, "max_seq_len": 1024,
	            "attn_rank": 32, "ffn_rank": 32,
	            "bloom_buckets": 8192, "bloom_hashes": 4,
	        },
	        "small": {
	            "dim": 512, "n_layers": 14, "n_heads": 8, "head_dim": 64,
	            "ffn_mult": 4, "max_seq_len": 2048,
	            "attn_rank": 48, "ffn_rank": 48,
	            "bloom_buckets": 16384, "bloom_hashes": 4,
	        },
	    }
	    if size not in presets:
	        raise ValueError(f"unknown spectral size '{size}', expected one of {sorted(presets)}")

	    chosen = presets[size]
	    backbone_keys = ("dim", "n_layers", "n_heads", "head_dim", "ffn_mult", "max_seq_len")
	    backbone = {key: chosen[key] for key in backbone_keys}
	    return OmegaConfig(
	        **backbone,
	        dropout=0.05,
	        tie_word_embeddings=False,  # BloomEmbedding is not tied to lm_head
	        architecture_mode="spectral",
	        precision_mode="bf16_quality",
	        # Spectral params are stored in existing dataclass fields.
	        memory_slots=chosen["bloom_hashes"],
	        memory_ranks=chosen["bloom_buckets"] // 256,  # normalized by 256
	        low_rank=chosen["attn_rank"],
	        ssm_d_state=chosen["ffn_rank"],
	    )


	def spectral_hyperparams(cfg: OmegaConfig) -> dict:
	    """Return the spectral-specific hyperparameters read from ``cfg``.

	    Reads the fields ``low_rank``, ``ssm_d_state``, ``memory_ranks`` and
	    ``memory_slots``; ``memory_ranks`` is multiplied by 256 to give the
	    bucket count.

	    Returns:
	        A dict with keys "attn_rank", "ffn_rank", "bloom_buckets" and
	        "bloom_hashes".
	    """
	    attn_rank = cfg.low_rank
	    ffn_rank = cfg.ssm_d_state
	    bucket_count = cfg.memory_ranks * 256
	    hash_count = cfg.memory_slots
	    return dict(
	        attn_rank=attn_rank,
	        ffn_rank=ffn_rank,
	        bloom_buckets=bucket_count,
	        bloom_hashes=hash_count,
	    )


	def purefield_config(size: str = "small") -> OmegaConfig:
	    """Return a base preset reconfigured for the PureField memory core.

	    The layer pattern becomes "P", per-size memory settings are applied,
	    and the ReGenesis-KV controls are switched on.

	    Args:
	        size: One of "tiny", "small", "medium", "4b", "12b".

	    Returns:
	        The preset for ``size`` with the PureField fields written onto it.

	    Raises:
	        ValueError: If ``size`` is not a known preset name.
	    """
	    presets = {
	        "tiny": tiny_config,
	        "small": small_config,
	        "medium": medium_config,
	        "4b": four_b_config,
	        "12b": twelve_b_config,
	    }
	    if size not in presets:
	        raise ValueError(f"unknown PureField size '{size}', expected one of {sorted(presets)}")

	    # Column order matches `memory_fields` below.
	    memory_fields = (
	        "memory_slots",
	        "memory_ranks",
	        "local_window",
	        "timescale_count",
	        "low_rank",
	        "sparse_recovery_steps",
	    )
	    memory_by_size = {
	        "tiny": (4, 16, 128, 4, 8, 200),
	        "small": (6, 32, 256, 6, 16, 2_000),
	        "medium": (8, 64, 512, 8, 32, 4_000),
	        "4b": (12, 128, 1024, 12, 64, 8_000),
	        "12b": (16, 192, 2048, 16, 96, 16_000),
	    }
	    # Only the two largest sizes leave the dense bf16 defaults.
	    modes_by_size = {
	        "4b": ("auto", "int4_4x8_pairwise_sparse"),
	        "12b": ("int6_bridge_imma_fast", "int6_2x4_pairwise_sparse"),
	    }

	    cfg = presets[size]()
	    cfg.architecture_mode = "purefield"
	    cfg.layer_pattern = "P"
	    cfg.int4_group_size = 64
	    cfg.quality_gate_delta = 0.05

	    precision, sparsity = modes_by_size.get(size, ("bf16_quality", "dense"))
	    cfg.precision_mode = precision
	    cfg.sparsity_mode = sparsity

	    for field_name, value in zip(memory_fields, memory_by_size[size]):
	        setattr(cfg, field_name, value)

	    cfg.contractive_eps = 1e-3
	    # 1 / sqrt(n_layers), capped at 1.0.
	    cfg.residual_alpha = min(1.0, cfg.n_layers ** -0.5)

	    # ReGenesis-KV controls; the KV rank follows low_rank with a floor of 4.
	    cfg.regen_kv_enabled = True
	    cfg.archive_chunk_tokens = 8192
	    cfg.retrieval_top_k = 8
	    cfg.regen_kv_rank = max(4, cfg.low_rank)
	    cfg.ledger_hash_mode = "sha256_merkle"
	    cfg.max_persistent_tokens = 10_000_000
	    return cfg


	def axiomweave_config(size: str = "tiny") -> OmegaConfig:
	    """Return an AxiomWeave routed-synthesis config.

	    Per the author, this preset is for experiments that combine attention,
	    SSM, PureField memory, KAN nonlinearity, ReGenesis-ready exact memory,
	    and INT4 sparse export readiness in one model family.

	    The config starts from ``purefield_config(size)``. The layer pattern is
	    then set to "W", ReGenesis-KV is switched off, and the verifier and
	    retrieval settings are raised.

	    Args:
	        size: One of "tiny", "small", "medium", "4b", "12b".

	    Raises:
	        ValueError: If ``size`` is not a known preset name.
	    """
	    presets = {
	        "tiny": tiny_config,
	        "small": small_config,
	        "medium": medium_config,
	        "4b": four_b_config,
	        "12b": twelve_b_config,
	    }
	    if size not in presets:
	        raise ValueError(f"unknown AxiomWeave size '{size}', expected one of {sorted(presets)}")

	    # Sizes not listed here fall back to dense bf16.
	    precision_by_size = {"12b": "int6_bridge_imma_fast", "4b": "auto"}
	    sparsity_by_size = {
	        "12b": "int6_2x4_pairwise_sparse",
	        "4b": "int4_4x8_pairwise_sparse",
	    }

	    cfg = purefield_config(size)
	    cfg.architecture_mode = "axiomweave"
	    cfg.layer_pattern = "W"
	    cfg.regen_kv_enabled = False
	    cfg.verifier_passes = 3
	    cfg.retrieval_top_k = 12
	    cfg.ledger_hash_mode = "sha256_merkle"
	    cfg.precision_mode = precision_by_size.get(size, "bf16_quality")
	    cfg.sparsity_mode = sparsity_by_size.get(size, "dense")
	    return cfg

Xet Storage Details

Size: 13.7 kB
Xet hash: 6c1bdf06ea0cb9a5bc43db2261c696d9107d2831ab01e7a7cf7e63cad9bc11fb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.