#!/usr/bin/env python3
"""
Substrate Sharding Strategy for Material-Field Governance
Cache-Aware Design for 250,000+ Verified States
Goal: Keep most relevant substrate anchors in L1 cache (32-64KB)
while maintaining sub-microsecond access times and deterministic behavior.
Architecture:
1. Hierarchical Sharding: Coarse β†’ Fine-grained locality
2. Semantic Clustering: Group related knowledge domains
3. LRU Cache Management: Keep hot shards resident
4. Deterministic Retrieval: Same query β†’ same shards loaded
Reference Implementation - Verhash LLC
Patent Priority: January 25, 2026
"""
import math
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict, Set
from collections import OrderedDict
import hashlib
import struct
from deterministic_rng import normal
# ==============================================================================
# CACHE ARCHITECTURE CONSTANTS
# ==============================================================================
L1_CACHE_SIZE = 32 * 1024 # 32 KB typical L1 data cache
L2_CACHE_SIZE = 256 * 1024 # 256 KB typical L2 cache
L3_CACHE_SIZE = 8 * 1024 * 1024 # 8 MB typical L3 cache
# Use explicit little-endian packing with fixed sizes for deterministic layout.
# Store x/y as float64 to match Python's `float` (C double on CPython).
# This counts only the x/y payload used in distance math; the packed record in
# CompactVector.to_bytes carries 12 extra bytes of shard/domain metadata.
VECTOR_SIZE_BYTES = 8 * 2 # 2 floats (x, y) × 8 bytes each = 16 bytes
# Real: 768 floats Γ— 4 bytes = 3KB per vector
# L1 budget: Reserve space for working set + shard metadata
L1_BUDGET_VECTORS = (L1_CACHE_SIZE // 2) // VECTOR_SIZE_BYTES # ~1000 vectors in L1
L2_BUDGET_VECTORS = (L2_CACHE_SIZE // 2) // VECTOR_SIZE_BYTES # ~8000 vectors in L2
# ==============================================================================
# VECTOR REPRESENTATION
# ==============================================================================
@dataclass
class CompactVector:
"""
Cache-optimized vector representation.
In production:
- Use float16 instead of float32 (halves memory)
- Pack metadata into single 64-bit word
- Align to cache line boundaries (64 bytes)
"""
x: float
y: float
shard_id: int # Which shard this belongs to
domain_hash: int # Semantic domain identifier
def to_bytes(self) -> bytes:
"""Pack into contiguous bytes for cache efficiency"""
return struct.pack('<ddIQ', float(self.x), float(self.y), int(self.shard_id), int(self.domain_hash))
@staticmethod
def from_bytes(data: bytes) -> 'CompactVector':
"""Unpack from contiguous bytes"""
x, y, shard_id, domain_hash = struct.unpack('<ddiQ', data)
return CompactVector(x, y, shard_id, domain_hash)
def distance_to(self, other: 'CompactVector') -> float:
"""Euclidean distance"""
return math.sqrt((self.x - other.x) ** 2 + (self.y - other.y) ** 2)
def dot_product(self, other: 'CompactVector') -> float:
"""Normalized dot product (cosine similarity)"""
self_norm = math.sqrt(self.x ** 2 + self.y ** 2)
other_norm = math.sqrt(other.x ** 2 + other.y ** 2)
if self_norm == 0 or other_norm == 0:
return 0.0
return (self.x * other.x + self.y * other.y) / (self_norm * other_norm)
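# Round-trip sketch (a minimal check, not part of the reference API): packing
# then unpacking must reproduce the vector exactly, since float64 survives
# struct round-trips bit-for-bit and dataclass equality compares all fields.
def _roundtrip_example():
    v = CompactVector(x=0.25, y=-0.5, shard_id=7, domain_hash=0xDEADBEEF)
    raw = v.to_bytes()
    assert len(raw) == struct.calcsize('<ddiQ')  # 28 bytes, metadata included
    assert CompactVector.from_bytes(raw) == v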
# ==============================================================================
# SHARD STRUCTURE
# ==============================================================================
@dataclass
class SubstrateShard:
"""
A shard is a contiguous block of verified states.
Design:
- Fixed size (e.g., 64 vectors = 1KB in 2D, 192KB in 768D)
- Cache-line aligned
- Sorted by locality (clustered semantically)
- Immutable after creation (no reallocation)
"""
shard_id: int
domain: str # e.g., "biology", "physics", "geography"
centroid: CompactVector # Representative center of this shard
vectors: List[CompactVector] # The actual verified states
size_bytes: int = 0 # Memory footprint
def __post_init__(self):
if self.size_bytes == 0:
self.size_bytes = len(self.vectors) * VECTOR_SIZE_BYTES
def is_l1_resident(self) -> bool:
"""Can this shard fit in L1?"""
return self.size_bytes <= L1_CACHE_SIZE // 4 # Reserve 75% for other data
def relevance_score(self, query_vector: CompactVector) -> float:
"""
Compute relevance of this shard to query.
Uses centroid distance as fast approximation.
"""
return 1.0 / (1.0 + query_vector.distance_to(self.centroid))
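# Ranking note: 1/(1+d) is strictly decreasing in d, so sorting shards by
# relevance_score descending is exactly sorting by centroid distance ascending;
# the score form just bounds values in (0, 1]. A quick usage sketch:
#   c = CompactVector(0.5, 0.5, 0, 0)
#   s = SubstrateShard(0, "demo", c, [c])
#   assert s.relevance_score(CompactVector(0.5, 0.5, -1, 0)) == 1.0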
# ==============================================================================
# HIERARCHICAL SHARDING STRATEGY
# ==============================================================================
class ShardedSubstrate:
"""
Hierarchical substrate sharding with cache-aware retrieval.
Architecture:
Level 1 (L1 Cache): Active shard (~64 vectors)
Level 2 (L2 Cache): Hot shards (~8 shards)
Level 3 (L3 Cache): Warm shards (~100 shards)
Level 4 (Main RAM): All shards (250K vectors / 64 = ~3906 shards)
Retrieval Strategy:
1. Hash query to domain (deterministic)
2. Load relevant domain shards into L3
3. Rank by centroid distance
4. Prefetch top-k shards into L2
5. Keep most relevant shard in L1
"""
def __init__(self,
shard_size: int = 64,
l1_capacity: int = 1, # Number of shards in L1
l2_capacity: int = 8, # Number of shards in L2
l3_capacity: int = 100): # Number of shards in L3
self.shard_size = shard_size
self.l1_capacity = l1_capacity
self.l2_capacity = l2_capacity
self.l3_capacity = l3_capacity
# Storage hierarchy
self.all_shards: Dict[int, SubstrateShard] = {} # All shards (RAM)
self.domain_index: Dict[str, List[int]] = {} # Domain β†’ shard IDs
# Cache tiers (LRU with size limits)
self.l1_cache: OrderedDict[int, SubstrateShard] = OrderedDict()
self.l2_cache: OrderedDict[int, SubstrateShard] = OrderedDict()
self.l3_cache: OrderedDict[int, SubstrateShard] = OrderedDict()
# Statistics
self.l1_hits = 0
self.l2_hits = 0
self.l3_hits = 0
self.l1_misses = 0
def add_shard(self, shard: SubstrateShard):
"""Add a shard to the substrate"""
self.all_shards[shard.shard_id] = shard
# Update domain index
if shard.domain not in self.domain_index:
self.domain_index[shard.domain] = []
self.domain_index[shard.domain].append(shard.shard_id)
def _get_domain_from_query(self, query_vector: CompactVector) -> str:
"""
Deterministically map query to semantic domain.
In production:
- Use learned classifier
- Or: hash query vector to domain buckets
- Or: use query context/tags
"""
# Simplified: use domain_hash embedded in query
if query_vector.domain_hash % 3 == 0:
return "biology"
elif query_vector.domain_hash % 3 == 1:
return "geography"
else:
return "physics"
def _evict_lru(self, cache: OrderedDict, capacity: int):
"""Evict least recently used items to maintain capacity"""
while len(cache) > capacity:
cache.popitem(last=False) # Remove oldest
def _promote_to_l1(self, shard_id: int):
"""Move shard to L1 cache"""
shard = self.all_shards[shard_id]
# Remove from lower tiers
self.l2_cache.pop(shard_id, None)
self.l3_cache.pop(shard_id, None)
# Add to L1 (most recent)
self.l1_cache[shard_id] = shard
self.l1_cache.move_to_end(shard_id)
# Evict if over capacity
self._evict_lru(self.l1_cache, self.l1_capacity)
def _promote_to_l2(self, shard_id: int):
"""Move shard to L2 cache"""
shard = self.all_shards[shard_id]
# Remove from L3
self.l3_cache.pop(shard_id, None)
# Add to L2
self.l2_cache[shard_id] = shard
self.l2_cache.move_to_end(shard_id)
self._evict_lru(self.l2_cache, self.l2_capacity)
def _promote_to_l3(self, shard_id: int):
"""Move shard to L3 cache"""
shard = self.all_shards[shard_id]
# Add to L3
self.l3_cache[shard_id] = shard
self.l3_cache.move_to_end(shard_id)
self._evict_lru(self.l3_cache, self.l3_capacity)
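# Eviction sketch: with l2_capacity = 3, promoting a fourth shard evicts the
# least recently promoted one (OrderedDict preserves promotion order).
# Assuming shards 1..4 were already added via add_shard:
#   sub = ShardedSubstrate(l2_capacity=3)
#   for sid in (1, 2, 3, 4): sub._promote_to_l2(sid)
#   assert 1 not in sub.l2_cache and 4 in sub.l2_cache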
def retrieve_relevant_shards(self,
query_vector: CompactVector,
top_k: int = 8) -> List[SubstrateShard]:
"""
Retrieve most relevant shards for query.
Cache-aware with deterministic retrieval.
Strategy:
1. Determine domain (deterministic hash)
2. Get candidate shards from domain
3. Rank by centroid distance
4. Promote the best shard to L1, the rest of the top-k to L2,
   and the next tier of candidates into L3
5. Return the top-k in order of relevance
"""
# Step 1: Get domain
domain = self._get_domain_from_query(query_vector)
# Step 2: Get candidate shard IDs from domain
if domain not in self.domain_index:
return [] # No shards for this domain
candidate_ids = self.domain_index[domain]
# Step 3: Rank shards by relevance
shard_scores = [
(shard_id, self.all_shards[shard_id].relevance_score(query_vector))
for shard_id in candidate_ids
]
# Deterministic ordering: score descending, with shard_id ascending on ties
shard_scores.sort(key=lambda s: (-s[1], s[0]))
# Step 4: Load top-k shards into cache hierarchy
top_shard_ids = [shard_id for shard_id, _ in shard_scores[:top_k]]
# Promote most relevant to L1; count a hit only if it was already resident
if top_shard_ids:
    if top_shard_ids[0] in self.l1_cache:
        self.l1_hits += 1
    else:
        self.l1_misses += 1
    self._promote_to_l1(top_shard_ids[0])
# Promote the rest of the top-k to L2; prior residency counts as a hit
for shard_id in top_shard_ids[1:]:
    if shard_id in self.l2_cache:
        self.l2_hits += 1
        self.l2_cache.move_to_end(shard_id)
    else:
        self._promote_to_l2(shard_id)
# Stage the next tier of ranked candidates into L3 as warm context
for shard_id, _ in shard_scores[top_k:top_k + self.l3_capacity]:
    if shard_id in self.l1_cache or shard_id in self.l2_cache:
        continue  # already resident in a faster tier
    if shard_id in self.l3_cache:
        self.l3_hits += 1
        self.l3_cache.move_to_end(shard_id)
    else:
        self._promote_to_l3(shard_id)
# Return shards in order of relevance
return [self.all_shards[sid] for sid in top_shard_ids]
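# Determinism sketch: identical queries must yield identical shard orderings
# (deterministic domain mapping plus the tie-broken sort above). With a
# populated substrate `sub` and query `q`:
#   a = [s.shard_id for s in sub.retrieve_relevant_shards(q, top_k=8)]
#   b = [s.shard_id for s in sub.retrieve_relevant_shards(q, top_k=8)]
#   assert a == b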
def get_l1_vectors(self) -> List[CompactVector]:
"""
Get all vectors currently in L1 cache.
This is the "hot path" for inference.
"""
vectors = []
for shard in self.l1_cache.values():
vectors.extend(shard.vectors)
return vectors
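# Hot-path sketch (an assumed consumer of get_l1_vectors, not original API):
# a linear scan over at most shard_size vectors; small, contiguous, and
# branch-predictable, which is what makes L1 residency pay off.
def nearest_l1_vector(self, query_vector: CompactVector) -> Optional[CompactVector]:
    best: Optional[CompactVector] = None
    best_dist = float('inf')
    for vec in self.get_l1_vectors():
        dist = query_vector.distance_to(vec)
        if dist < best_dist:
            best, best_dist = vec, dist
    return best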
def print_stats(self):
"""Print cache statistics"""
print("\n" + "=" * 80)
print("SUBSTRATE SHARDING STATISTICS")
print("=" * 80)
print(f"Total shards: {len(self.all_shards)}")
print(f"Total vectors: {sum(len(s.vectors) for s in self.all_shards.values())}")
print(f"Domains: {list(self.domain_index.keys())}")
print()
print(f"L1 Cache ({self.l1_capacity} shards max):")
print(f" Current: {len(self.l1_cache)} shards")
print(f" Vectors: {sum(len(s.vectors) for s in self.l1_cache.values())}")
print(f" Size: {sum(s.size_bytes for s in self.l1_cache.values())} bytes")
print(f" Hits: {self.l1_hits}")
print()
print(f"L2 Cache ({self.l2_capacity} shards max):")
print(f" Current: {len(self.l2_cache)} shards")
print(f" Vectors: {sum(len(s.vectors) for s in self.l2_cache.values())}")
print(f" Hits: {self.l2_hits}")
print()
print(f"L3 Cache ({self.l3_capacity} shards max):")
print(f" Current: {len(self.l3_cache)} shards")
print(f" Vectors: {sum(len(s.vectors) for s in self.l3_cache.values())}")
print(f" Hits: {self.l3_hits}")
print("=" * 80)
# ==============================================================================
# DEMONSTRATION: 250K VERIFIED STATES
# ==============================================================================
def create_demo_substrate(num_vectors: int = 250000,
shard_size: int = 64) -> ShardedSubstrate:
"""
Create a demo substrate with 250K verified states.
Simulates three domains: biology, geography, physics.
"""
print(f"Creating substrate with {num_vectors:,} verified states...")
print(f"Shard size: {shard_size} vectors")
print(f"Expected shards: {num_vectors // shard_size}")
substrate = ShardedSubstrate(shard_size=shard_size)
domains = ["biology", "geography", "physics"]
vectors_per_domain = num_vectors // len(domains)
shard_id = 0
for domain_idx, domain in enumerate(domains):
print(f"\nGenerating {vectors_per_domain:,} vectors for domain: {domain}")
# Generate vectors clustered around domain centroid
# Biology: centered around (0.8, 0.8)
# Geography: centered around (0.5, 0.5)
# Physics: centered around (0.2, 0.2)
if domain == "biology":
center_x, center_y = 0.8, 0.8
elif domain == "geography":
center_x, center_y = 0.5, 0.5
else:
center_x, center_y = 0.2, 0.2
domain_hash = hashlib.sha256(domain.encode()).digest()[:8]
domain_hash_int = int.from_bytes(domain_hash, 'big')
domain_seed = b"demo_substrate|" + domain.encode("utf-8")
# Create shards for this domain
for i in range(0, vectors_per_domain, shard_size):
vectors = []
for j in range(shard_size):
if i + j >= vectors_per_domain:
break
# Generate vector with small random offset
global_idx = i + j
x = center_x + normal(domain_seed, global_idx * 4, mean=0.0, std=0.1)
y = center_y + normal(domain_seed, global_idx * 4 + 2, mean=0.0, std=0.1)
vec = CompactVector(
x=x,
y=y,
shard_id=shard_id,
domain_hash=domain_hash_int
)
vectors.append(vec)
# Compute centroid
if vectors:
centroid_x = sum(v.x for v in vectors) / len(vectors)
centroid_y = sum(v.y for v in vectors) / len(vectors)
centroid = CompactVector(centroid_x, centroid_y, shard_id, domain_hash_int)
shard = SubstrateShard(
shard_id=shard_id,
domain=domain,
centroid=centroid,
vectors=vectors
)
substrate.add_shard(shard)
shard_id += 1
print(f"\nSubstrate created: {len(substrate.all_shards)} shards")
return substrate
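# Reproducibility sketch: the generator is counter-based (seeded only by
# domain and index), so two independent builds must agree byte-for-byte.
# A minimal check on a small substrate:
#   a = create_demo_substrate(num_vectors=192, shard_size=64)
#   b = create_demo_substrate(num_vectors=192, shard_size=64)
#   assert all(x.to_bytes() == y.to_bytes()
#              for x, y in zip(a.all_shards[0].vectors, b.all_shards[0].vectors))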
def demo_query_performance():
"""
Demonstrate query performance with sharded substrate.
"""
print("""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ β•‘
β•‘ Substrate Sharding Demonstration β•‘
β•‘ β•‘
β•‘ Exercises deterministic sharding and cache residency over β•‘
β•‘ a large verified substrate. Illustrates how relevance β•‘
β•‘ ranking and promotion constrain active context. β•‘
β•‘ β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
# Create substrate
substrate = create_demo_substrate(num_vectors=250000, shard_size=64)
substrate.print_stats()
# Simulate queries from different domains
print("\n" + "=" * 80)
print("QUERY SIMULATION")
print("=" * 80)
# Biology query
print("\nQuery 1: Biology domain (photosynthesis)")
bio_query = CompactVector(x=0.82, y=0.78, shard_id=-1, domain_hash=0)
relevant_shards = substrate.retrieve_relevant_shards(bio_query, top_k=8)
print(f" Retrieved {len(relevant_shards)} relevant shards")
print(f" L1 vectors available: {len(substrate.get_l1_vectors())}")
print(f" Most relevant shard: {relevant_shards[0].shard_id} (domain: {relevant_shards[0].domain})")
# Geography query
print("\nQuery 2: Geography domain (capitals)")
geo_query = CompactVector(x=0.48, y=0.52, shard_id=-1, domain_hash=1)
relevant_shards = substrate.retrieve_relevant_shards(geo_query, top_k=8)
print(f" Retrieved {len(relevant_shards)} relevant shards")
print(f" L1 vectors available: {len(substrate.get_l1_vectors())}")
print(f" Most relevant shard: {relevant_shards[0].shard_id} (domain: {relevant_shards[0].domain})")
# Physics query
print("\nQuery 3: Physics domain (mechanics)")
phys_query = CompactVector(x=0.18, y=0.22, shard_id=-1, domain_hash=2)
relevant_shards = substrate.retrieve_relevant_shards(phys_query, top_k=8)
print(f" Retrieved {len(relevant_shards)} relevant shards")
print(f" L1 vectors available: {len(substrate.get_l1_vectors())}")
print(f" Most relevant shard: {relevant_shards[0].shard_id} (domain: {relevant_shards[0].domain})")
# Print final stats
substrate.print_stats()
# Memory footprint analysis
print("\n" + "=" * 80)
print("MEMORY FOOTPRINT ANALYSIS")
print("=" * 80)
l1_size = sum(s.size_bytes for s in substrate.l1_cache.values())
l2_size = sum(s.size_bytes for s in substrate.l2_cache.values())
total_size = sum(s.size_bytes for s in substrate.all_shards.values())
print(f"L1 Cache: {l1_size:,} bytes ({l1_size/1024:.2f} KB)")
print(f" Fits in L1? {l1_size <= L1_CACHE_SIZE}")
print(f" L1 capacity: {L1_CACHE_SIZE:,} bytes ({L1_CACHE_SIZE/1024:.2f} KB)")
print()
print(f"L2 Cache: {l2_size:,} bytes ({l2_size/1024:.2f} KB)")
print(f" Fits in L2? {l2_size <= L2_CACHE_SIZE}")
print(f" L2 capacity: {L2_CACHE_SIZE:,} bytes ({L2_CACHE_SIZE/1024:.2f} KB)")
print()
print(f"Total Substrate: {total_size:,} bytes ({total_size/1024/1024:.2f} MB)")
print()
print(f"Cache efficiency: {(l1_size + l2_size) / total_size * 100:.2f}% in fast cache")
print("\n" + "=" * 80)
print("KEY INSIGHTS")
print("=" * 80)
print("""
1. CACHE RESIDENCY:
- L1 holds 1 shard (64 vectors) = most relevant for current query
- L2 holds 8 shards (512 vectors) = domain-relevant context
- Working set: 576 vectors in fast cache (< 10 KB in 2D)
2. RETRIEVAL STRATEGY:
- Deterministic domain mapping (hash-based)
- Centroid-based relevance ranking
- LRU eviction maintains hot shards
3. SCALING TO 768D:
- 768 floats Γ— 4 bytes = 3 KB per vector
- L1: 64 vectors Γ— 3 KB = 192 KB (fits in L2, not L1)
- Strategy: Reduce L1 capacity to 10-16 vectors for true L1 residence
4. PRODUCTION OPTIMIZATIONS:
- Use float16 (half precision) β†’ 1.5 KB per vector
- Quantize to int8 β†’ 768 bytes per vector
- Product quantization β†’ 64-128 bytes per vector
- With PQ: L1 can hold 256-512 vectors (16-32 KB)
5. INFERENCE PATH:
- Query arrives β†’ Hash to domain β†’ Load L2 shards
- Rank by centroid β†’ Promote top shard to L1
- Inference runs on L1-resident vectors only
- Retrieval stays deterministic and cache-friendly; sub-microsecond access
  is plausible once the working set is L1-resident
""")
if __name__ == "__main__":
demo_query_performance()