Upload folder using huggingface_hub
Browse files
- overlay/configs/__init__.py +5 -0
- overlay/configs/hardware_config.py +104 -0
- overlay/configs/harness_config.py +108 -0
- overlay/configs/model_config.py +80 -0
overlay/configs/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from configs.hardware_config import HardwareConfig
|
| 2 |
+
from configs.harness_config import HarnessConfig
|
| 3 |
+
from configs.model_config import PostSemClawConfig
|
| 4 |
+
|
| 5 |
+
# Public names of the configs package: the three config models imported above.
__all__ = ["PostSemClawConfig", "HarnessConfig", "HardwareConfig"]
|
overlay/configs/hardware_config.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hardware detection and memory budget configuration."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class HardwareConfig(BaseModel):
    """Hardware description plus the VRAM budgets derived from it.

    Every field has a default, so ``HardwareConfig()`` is a valid placeholder
    when no CUDA device is present. Use :meth:`detect` to populate the fields
    from the current CUDA device.
    """

    gpu_name: str = Field(default="unknown", description="GPU device name")
    gpu_memory_mb: int = Field(default=0, description="Total GPU memory in MB")
    gpu_vram_mb: int = Field(default=0, description="Alias for gpu_memory_mb (legacy compat)")
    compute_capability: tuple[int, int] = Field(
        default=(0, 0), description="CUDA compute capability"
    )
    peak_flops: float = Field(
        default=12.74e12, description="Peak FP32 FLOPS for MFU calculation"
    )
    bf16_peak_flops: float = Field(
        default=38.1e12, description="Peak BF16 FLOPS (RTX 3060 default)"
    )

    # Memory budget
    model_budget_mb: int = Field(
        default=1500, description="Max MB for model params + optimizer"
    )
    activation_budget_mb: int = Field(
        default=3000, description="Max MB for activations"
    )
    overhead_mb: int = Field(
        default=500, description="Reserved for CUDA context + PyTorch overhead"
    )
    max_vram_usage_pct: float = Field(
        default=90.0, description="Max VRAM usage as % of total"
    )
    gradient_checkpointing: bool = Field(
        default=False, description="Enable gradient checkpointing to save VRAM"
    )

    @classmethod
    def detect(cls) -> HardwareConfig:
        """Build a config from the current CUDA device.

        Returns:
            ``cls()`` with all defaults when CUDA is unavailable; otherwise an
            instance whose name, memory, compute capability, peak-FLOPS and
            budget fields are derived from the device properties.
        """
        if not torch.cuda.is_available():
            return cls()

        props = torch.cuda.get_device_properties(torch.cuda.current_device())
        cap = (props.major, props.minor)
        mem_mb = props.total_memory // (1024 * 1024)
        gpu_name = props.name

        # Peak FP32 FLOPS lookup by compute capability (approximate).
        # NOTE(review): the (9, 0) entry holds the same value as the "H100"
        # BF16 entry further down, i.e. it is a BF16 figure inside the FP32
        # table. Left unchanged here; confirm which number the MFU code expects.
        fp32_flops_table: dict[tuple[int, int], float] = {
            (8, 6): 12.74e12,  # RTX 3060
            (8, 9): 40.09e12,  # RTX 4090
            (9, 0): 989.5e12,  # H100 (BF16)
        }
        peak = fp32_flops_table.get(cap, 12.74e12)

        # BF16 peak FLOPS lookup by GPU name substring; first match wins,
        # unknown names fall back to the RTX 3060 figure.
        bf16_flops_table: dict[str, float] = {
            "3060": 38.1e12,
            "3090": 71.0e12,
            "4090": 165.2e12,
            "A100": 312e12,
            "H100": 989.5e12,
            "A10G": 70.0e12,
        }
        bf16_peak = next(
            (flops for key, flops in bf16_flops_table.items() if key in gpu_name),
            38.1e12,
        )

        # Memory budget: reserve a fixed overhead for the CUDA context, then
        # split the remainder 30/70. Clamp at zero so a device smaller than the
        # overhead cannot produce negative budgets.
        overhead = 500
        available = max(0, mem_mb - overhead)
        model_budget = int(available * 0.3)  # 30% for params + optimizer
        activation_budget = int(available * 0.7)  # 70% for activations

        return cls(
            gpu_name=gpu_name,
            gpu_memory_mb=mem_mb,
            gpu_vram_mb=mem_mb,
            compute_capability=cap,
            peak_flops=peak,
            bf16_peak_flops=bf16_peak,
            model_budget_mb=model_budget,
            activation_budget_mb=activation_budget,
            # Record the overhead actually subtracted above so the stored
            # field and the computed budgets cannot drift apart.
            overhead_mb=overhead,
        )

    def suggest_batch_size(
        self,
        d_model: int,
        seq_len: int,
        n_layer: int,
        *,
        bytes_per_element: int = 4,
    ) -> int:
        """Suggest a power-of-two batch size that fits the activation budget.

        Uses a rough estimate: per-sample activation ~= n_layer * seq_len *
        d_model * bytes_per_element * 2 (fwd + bwd).

        Args:
            d_model: Model embedding dimension.
            seq_len: Sequence length.
            n_layer: Number of layers.
            bytes_per_element: Bytes per activation element. The default of 4
                reproduces the original fixed estimate.

        Returns:
            The largest power of two not exceeding
            ``activation_budget_mb / per_sample_mb``, and never less than 1.
        """
        per_sample_mb = (
            n_layer * seq_len * d_model * bytes_per_element * 2 / (1024 * 1024)
        )
        if per_sample_mb <= 0:
            return 1
        batch = max(1, int(self.activation_budget_mb / per_sample_mb))
        # Round down to a power of two; batch >= 1, so bit_length() >= 1.
        return 1 << (batch.bit_length() - 1)
|
overlay/configs/harness_config.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Harness configuration for HYDRA's self-evolving outer loop."""
|
| 2 |
+
from typing import Literal
|
| 3 |
+
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
|
| 6 |
+
# Bounds for a single metric, keyed by bound name; the defaults in this file use "min" and "max".
GateThresholds = dict[str, float]
|
| 7 |
+
# Maps a metric name to the bounds that apply to it.
GateConfig = dict[str, GateThresholds]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class HarnessConfig(BaseModel):
    """Settings that control how the HYDRA harness runs experiments."""

    # --- Inner loop ---
    time_budget_seconds: int = Field(
        default=300,
        ge=60,
        description="Training time budget per experiment in seconds",
    )
    max_experiments: int = Field(
        default=1000,
        ge=0,
        description="Max experiments before stopping (0=infinite)",
    )

    # --- Meta-agent ---
    meta_interval: int = Field(
        default=20,
        ge=5,
        description="Run meta-agent every N experiments",
    )
    max_meta_changes: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Max changes per meta-iteration",
    )

    # --- Search strategy ---
    exploration_mode: Literal["conservative", "balanced", "bold"] = "balanced"
    exploration_budget: int = Field(
        default=5,
        ge=1,
        description="Consecutive bold experiments when stuck",
    )
    stuck_threshold: int = Field(
        default=10,
        ge=3,
        description="No improvement for N experiments = stuck",
    )
    crash_threshold: float = Field(
        default=0.5, ge=0.1, le=1.0, description="Crash rate threshold for BROKEN state"
    )
    regression_tolerance: float = Field(
        default=0.05, ge=0, le=0.2, description="Max val_bpb regression from best (fraction)"
    )
    max_regression_pct: float = Field(
        default=5.0,
        description="Max % regression from best known val_bpb",
    )

    # --- Keep/discard criteria ---
    primary_metric: str = "val_bpb"
    # Default gates: metric name -> {"min" or "max": bound}.
    secondary_metrics: GateConfig = Field(
        default_factory=lambda: {
            "mhc_spectral_norm": {"max": 2.0},
            "engram_hit_rate": {"min": 0.1},
            "factual_english_score": {"min": 0.5},
            "instruction_following_score": {"min": 0.5},
            "distinct_2": {"min": 0.1},
            "repetition_rate": {"max": 0.2},
            "hestia_quant_error": {"max": 0.05},
        }
    )

    # --- Experiment execution ---
    experiment_timeout: int = Field(
        default=600,
        ge=300,
        description="Kill experiment after N seconds",
    )
    warmup_steps: int = Field(
        default=10,
        ge=0,
        description="Steps to exclude from timing",
    )

    # --- Git ---
    branch_prefix: str = Field(
        default="autoresearch",
        description="Branch naming prefix",
    )
    results_file: str = Field(
        default="results.tsv",
        description="Experiment log file",
    )

    # --- Secondary metric gates (optional keep/discard criteria) ---
    gate_mhc_spectral_norm: float | None = Field(
        default=None,
        description="Max mhc_spectral_norm for keep (None=disabled)",
    )
    gate_engram_hit_rate: float | None = Field(
        default=None,
        description="Min engram_hit_rate for keep (None=disabled)",
    )
    gate_tps_median: float | None = Field(
        default=None, description="Min steady-state tps_median for keep (None=disabled)"
    )
    gate_tps_p10: float | None = Field(
        default=None, description="Min steady-state tps_p10 for keep (None=disabled)"
    )

    def to_secondary_gates(self) -> GateConfig:
        """Return the effective gates: ``secondary_metrics`` with gate_* overrides applied.

        Each per-metric dict is copied, so the result can be mutated without
        touching ``self.secondary_metrics``. A ``gate_*`` field left at
        ``None`` contributes nothing.
        """
        # (metric name, bound key, override value), applied in this order.
        overrides = (
            ("mhc_spectral_norm", "max", self.gate_mhc_spectral_norm),
            ("engram_hit_rate", "min", self.gate_engram_hit_rate),
            ("tps_median", "min", self.gate_tps_median),
            ("tps_p10", "min", self.gate_tps_p10),
        )

        active: GateConfig = {}
        for name, bounds in self.secondary_metrics.items():
            active[name] = dict(bounds)

        for name, bound, value in overrides:
            if value is None:
                continue
            active.setdefault(name, {})[bound] = value

        return active
|
overlay/configs/model_config.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Post-SEM-Claw model configuration with Pydantic validation."""
|
| 2 |
+
from pydantic import BaseModel, Field, field_validator
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class PostSemClawConfig(BaseModel):
    """Validated hyperparameters for the Post-SEM-Claw architecture.

    Default values mirror the @dataclass in train.py exactly.
    train.py is the source of truth — this file must stay in sync with it.
    """

    # --- Sequence ---
    sequence_len: int = Field(
        default=2048,
        description="Context length (from prepare.py MAX_SEQ_LEN)",
    )
    vocab_size: int = Field(
        default=8192,
        description="Vocabulary size (from prepare.py VOCAB_SIZE)",
    )

    # --- Mamba-3 SSM ---
    n_layer: int = Field(
        default=4, ge=1, le=48, description="Number of Mamba-3 blocks"
    )
    d_model: int = Field(
        default=256, ge=64, description="Model embedding dimension"
    )
    d_state: int = Field(
        default=64, ge=16, description="SSM state dimension"
    )
    headdim: int = Field(
        default=32, ge=16, description="SSM head dimension"
    )
    n_heads: int = Field(
        default=8, ge=1, description="Number of SSM heads (d_model // headdim)"
    )
    expand: int = Field(
        default=2,
        ge=1,
        le=4,
        description="Inner dim multiplier (inner_dim = expand * d_model)",
    )

    # --- mHC (Manifold Hyper-Connection) ---
    mhc_n_streams: int = Field(
        default=4, ge=2, le=8, description="Number of residual streams"
    )
    mhc_sinkhorn_iters: int = Field(
        default=5, ge=1, le=100, description="Sinkhorn-Knopp iterations"
    )

    # --- Engram (conditional memory) ---
    engram_n_columns: int = Field(
        default=4096, ge=256, description="Hash table columns"
    )
    engram_key_dim: int = Field(
        default=64, ge=16, description="Engram key dimension"
    )
    engram_layer_idx: int = Field(
        default=1, ge=0, description="Which layer gets engram (0-indexed)"
    )

    # --- Hestia QAT (disabled Phase 1, skeleton only) ---
    hestia_enabled: bool = Field(
        default=False, description="Enable Hestia quantization"
    )
    hestia_bits: float = Field(
        default=1.58,
        gt=0,
        description="Target quantization bits (1.58 = 1.58-bit ternary)",
    )

    # --- SDR (bypass-only in Phase 1) ---
    sdr_enabled: bool = Field(
        default=False, description="Enable stochastic resonance"
    )
    sdr_k: int = Field(
        default=64, ge=1, description="Top-K sparsification"
    )
    sdr_noise_std: float = Field(
        default=0.1, ge=0.0, description="SR noise standard deviation"
    )

    @field_validator("n_heads")
    @classmethod
    def validate_heads(cls, v: int, info: "FieldValidationInfo") -> int:
        """Reject an ``n_heads`` value that differs from ``d_model // headdim``.

        ``d_model`` and ``headdim`` are read from ``info.data``; when either is
        missing there, 256 and 32 respectively are used instead.

        Raises:
            ValueError: If ``v`` is not equal to the expected head count.
        """
        seen = info.data
        expected = seen.get("d_model", 256) // seen.get("headdim", 32)
        if v == expected:
            return v
        raise ValueError(
            f"n_heads ({v}) must equal d_model // headdim ({expected})"
        )

    def estimate_params(self) -> int:
        """Return a rough parameter count for the configured model.

        The total is the sum of the embedding and lm_head tables, the
        per-block weights times ``n_layer``, one engram instance, and the mHC
        mixing matrices.
        """
        width = self.d_model
        inner_dim = self.expand * width

        # Per-block terms.
        # in_proj: d_model -> inner + inner + d_state + d_state + n_heads
        in_proj_weights = width * (
            inner_dim + inner_dim + self.d_state + self.d_state + self.n_heads
        )
        out_proj_weights = inner_dim * width
        conv_weights = inner_dim * 4  # conv1d (kernel=4, groups=inner_dim)
        ssm_vectors = self.n_heads * 3  # A_log, lambda_theta, D: n_heads each
        bc_norm_weights = self.d_state * 2  # bc_norm: weight + bias
        block_total = (
            in_proj_weights
            + out_proj_weights
            + conv_weights
            + ssm_vectors
            + bc_norm_weights
        )
        all_blocks = block_total * self.n_layer

        # Embedding + lm_head, counted as two vocab_size x d_model tables.
        embedding_tables = self.vocab_size * width * 2

        # Engram: columns * d_model keys + d_model * engram_key_dim projection.
        engram_weights = (
            self.engram_n_columns * width + width * self.engram_key_dim
        )

        # mHC mixing matrices: n_layer * mhc_n_streams^2.
        mhc_weights = self.n_layer * self.mhc_n_streams ** 2

        return embedding_tables + all_blocks + engram_weights + mhc_weights
|