icarus112 committed on
Commit
2d94172
·
verified ·
1 Parent(s): 22741d9

Upload folder using huggingface_hub

Browse files
overlay/configs/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from configs.hardware_config import HardwareConfig
2
+ from configs.harness_config import HarnessConfig
3
+ from configs.model_config import PostSemClawConfig
4
+
5
+ __all__ = ["PostSemClawConfig", "HarnessConfig", "HardwareConfig"]
overlay/configs/hardware_config.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hardware detection and memory budget configuration."""
2
+ from __future__ import annotations
3
+
4
+ import torch
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class HardwareConfig(BaseModel):
    """Hardware description plus the memory budgets derived from it.

    Instances are normally built with :meth:`detect`, which reads the current
    CUDA device. Constructing the class directly yields the defaults declared
    below (an "unknown" GPU with RTX 3060 throughput figures).
    """

    gpu_name: str = Field(default="unknown", description="GPU device name")
    gpu_memory_mb: int = Field(default=0, description="Total GPU memory in MB")
    gpu_vram_mb: int = Field(default=0, description="Alias for gpu_memory_mb (legacy compat)")
    compute_capability: tuple[int, int] = Field(
        default=(0, 0), description="CUDA compute capability"
    )
    peak_flops: float = Field(
        default=12.74e12, description="Peak FP32 FLOPS for MFU calculation"
    )
    bf16_peak_flops: float = Field(
        default=38.1e12, description="Peak BF16 FLOPS (RTX 3060 default)"
    )

    # Memory budget
    model_budget_mb: int = Field(
        default=1500, description="Max MB for model params + optimizer"
    )
    activation_budget_mb: int = Field(
        default=3000, description="Max MB for activations"
    )
    overhead_mb: int = Field(
        default=500, description="Reserved for CUDA context + PyTorch overhead"
    )
    max_vram_usage_pct: float = Field(
        default=90.0, description="Max VRAM usage as % of total"
    )
    gradient_checkpointing: bool = Field(
        default=False, description="Enable gradient checkpointing to save VRAM"
    )

    @classmethod
    def detect(cls) -> "HardwareConfig":
        """Build a config from the current CUDA device.

        Returns:
            A default-constructed instance when CUDA is unavailable. Otherwise
            an instance whose name, memory and compute capability come from
            ``torch.cuda.get_device_properties``, whose throughput figures come
            from the lookup tables below, and whose model/activation budgets
            are a 30/70 split of the memory left after the reserved overhead.
        """
        if not torch.cuda.is_available():
            return cls()

        device = torch.cuda.current_device()
        props = torch.cuda.get_device_properties(device)
        cap = (props.major, props.minor)
        mem_mb = props.total_memory // (1024 * 1024)
        gpu_name = props.name

        # Peak FP32 FLOPS keyed by compute capability. Approximate: one entry
        # stands in for every card that reports the same capability.
        fp32_flops_table: dict[tuple[int, int], float] = {
            (8, 6): 12.74e12,  # RTX 3060
            # NOTE(review): 40.09e12 looks like the RTX 4070 Ti figure; the
            # published RTX 4090 FP32 peak is ~82.6e12 -- confirm before changing.
            (8, 9): 40.09e12,  # RTX 4090
            # H100 FP32 (non-tensor) peak. The previous value here was the
            # BF16 tensor-core figure, which belongs in bf16_flops_table only.
            (9, 0): 67.0e12,  # H100
        }
        peak = fp32_flops_table.get(cap, 12.74e12)

        # Peak BF16 FLOPS keyed by a substring of the device name; the first
        # key (in insertion order) contained in the name wins.
        bf16_flops_table: dict[str, float] = {
            "3060": 38.1e12,
            "3090": 71.0e12,
            "4090": 165.2e12,
            "A100": 312e12,
            "H100": 989.5e12,
            "A10G": 70.0e12,
        }
        bf16_peak = next(
            (flops for key, flops in bf16_flops_table.items() if key in gpu_name),
            38.1e12,  # default to RTX 3060
        )

        # Memory budget: reserve `overhead` MB for the CUDA context, then split
        # the remainder. Clamp at zero so a device smaller than the reserve
        # cannot produce negative budgets.
        overhead = 500
        available = max(0, mem_mb - overhead)
        model_budget = int(available * 0.3)  # 30% for params + optimizer
        activation_budget = int(available * 0.7)  # 70% for activations

        return cls(
            gpu_name=gpu_name,
            gpu_memory_mb=mem_mb,
            gpu_vram_mb=mem_mb,
            compute_capability=cap,
            peak_flops=peak,
            bf16_peak_flops=bf16_peak,
            model_budget_mb=model_budget,
            activation_budget_mb=activation_budget,
            # Record the reserve actually used above so the stored field and
            # the computed budgets cannot drift apart.
            overhead_mb=overhead,
        )

    def suggest_batch_size(self, d_model: int, seq_len: int, n_layer: int) -> int:
        """Suggest a power-of-two batch size that fits the activation budget.

        Per-sample activation memory is estimated as
        ``n_layer * seq_len * d_model * 4 bytes * 2`` (forward + backward),
        converted to MB.

        Args:
            d_model: Model embedding dimension.
            seq_len: Sequence length in tokens.
            n_layer: Number of layers.

        Returns:
            The largest power of two not exceeding
            ``activation_budget_mb / per_sample_mb``, and never less than 1.
            Returns 1 when the per-sample estimate is not positive.
        """
        per_sample_mb = n_layer * seq_len * d_model * 4 * 2 / (1024 * 1024)
        if per_sample_mb <= 0:
            return 1
        batch = max(1, int(self.activation_budget_mb / per_sample_mb))
        # Round down to a power of two: keep only the highest set bit.
        return 1 << (batch.bit_length() - 1)
overlay/configs/harness_config.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Harness configuration for HYDRA's self-evolving outer loop."""
2
+ from typing import Literal
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
# Bounds for a single metric, keyed by bound kind; this file uses the keys
# "min" and "max" (e.g. {"max": 2.0}).
GateThresholds = dict[str, float]
# Metric name -> the bounds that metric must satisfy.
GateConfig = dict[str, GateThresholds]
8
+
9
+
10
class HarnessConfig(BaseModel):
    """Settings that steer the HYDRA harness: budgets, search policy and gates."""

    # --- Inner loop ---------------------------------------------------------
    time_budget_seconds: int = Field(
        default=300,
        ge=60,
        description="Training time budget per experiment in seconds",
    )
    max_experiments: int = Field(
        default=1000,
        ge=0,
        description="Max experiments before stopping (0=infinite)",
    )

    # --- Meta-agent ---------------------------------------------------------
    meta_interval: int = Field(
        default=20,
        ge=5,
        description="Run meta-agent every N experiments",
    )
    max_meta_changes: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Max changes per meta-iteration",
    )

    # --- Search strategy ----------------------------------------------------
    exploration_mode: Literal["conservative", "balanced", "bold"] = "balanced"
    exploration_budget: int = Field(
        default=5,
        ge=1,
        description="Consecutive bold experiments when stuck",
    )
    stuck_threshold: int = Field(
        default=10,
        ge=3,
        description="No improvement for N experiments = stuck",
    )
    crash_threshold: float = Field(
        default=0.5, ge=0.1, le=1.0, description="Crash rate threshold for BROKEN state"
    )
    regression_tolerance: float = Field(
        default=0.05, ge=0, le=0.2, description="Max val_bpb regression from best (fraction)"
    )
    max_regression_pct: float = Field(
        default=5.0,
        description="Max % regression from best known val_bpb",
    )

    # --- Keep/discard criteria ----------------------------------------------
    primary_metric: str = "val_bpb"
    # A factory is used so every instance gets its own mutable dict.
    secondary_metrics: GateConfig = Field(
        default_factory=lambda: {
            "mhc_spectral_norm": {"max": 2.0},
            "engram_hit_rate": {"min": 0.1},
            "factual_english_score": {"min": 0.5},
            "instruction_following_score": {"min": 0.5},
            "distinct_2": {"min": 0.1},
            "repetition_rate": {"max": 0.2},
            "hestia_quant_error": {"max": 0.05},
        }
    )

    # --- Experiment execution -----------------------------------------------
    experiment_timeout: int = Field(
        default=600,
        ge=300,
        description="Kill experiment after N seconds",
    )
    warmup_steps: int = Field(
        default=10,
        ge=0,
        description="Steps to exclude from timing",
    )

    # --- Git ----------------------------------------------------------------
    branch_prefix: str = Field(
        default="autoresearch", description="Branch naming prefix"
    )
    results_file: str = Field(
        default="results.tsv", description="Experiment log file"
    )

    # --- Optional per-metric overrides (None leaves the gate untouched) -----
    gate_mhc_spectral_norm: float | None = Field(
        default=None,
        description="Max mhc_spectral_norm for keep (None=disabled)",
    )
    gate_engram_hit_rate: float | None = Field(
        default=None,
        description="Min engram_hit_rate for keep (None=disabled)",
    )
    gate_tps_median: float | None = Field(
        default=None, description="Min steady-state tps_median for keep (None=disabled)"
    )
    gate_tps_p10: float | None = Field(
        default=None, description="Min steady-state tps_p10 for keep (None=disabled)"
    )

    def to_secondary_gates(self) -> GateConfig:
        """Return the effective gates: ``secondary_metrics`` with ``gate_*`` applied.

        Each metric's thresholds are copied, so the returned mapping can be
        modified without touching ``secondary_metrics``. Every ``gate_*`` field
        that is not ``None`` then sets (or creates) the matching bound.
        """
        active: GateConfig = {
            name: dict(limits) for name, limits in self.secondary_metrics.items()
        }

        # (metric name, bound kind, override value), applied in this order.
        overrides = (
            ("mhc_spectral_norm", "max", self.gate_mhc_spectral_norm),
            ("engram_hit_rate", "min", self.gate_engram_hit_rate),
            ("tps_median", "min", self.gate_tps_median),
            ("tps_p10", "min", self.gate_tps_p10),
        )
        for metric, bound, value in overrides:
            if value is None:
                continue
            active.setdefault(metric, {})[bound] = value

        return active
overlay/configs/model_config.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Post-SEM-Claw model configuration with Pydantic validation."""
2
+ from pydantic import BaseModel, Field, field_validator
3
+
4
+
5
class PostSemClawConfig(BaseModel):
    """Validated configuration for the Post-SEM-Claw architecture.

    Per the original author's note, the defaults are meant to mirror the
    ``@dataclass`` in train.py, which is the source of truth; keep this file
    in sync with it.
    """

    # Sequence
    sequence_len: int = Field(default=2048, description="Context length (from prepare.py MAX_SEQ_LEN)")
    vocab_size: int = Field(default=8192, description="Vocabulary size (from prepare.py VOCAB_SIZE)")

    # Mamba-3 SSM
    n_layer: int = Field(default=4, ge=1, le=48, description="Number of Mamba-3 blocks")
    d_model: int = Field(default=256, ge=64, description="Model embedding dimension")
    d_state: int = Field(default=64, ge=16, description="SSM state dimension")
    headdim: int = Field(default=32, ge=16, description="SSM head dimension")
    # Declared after d_model and headdim so both are already validated when
    # validate_heads runs.
    n_heads: int = Field(default=8, ge=1, description="Number of SSM heads (d_model // headdim)")
    expand: int = Field(default=2, ge=1, le=4, description="Inner dim multiplier (inner_dim = expand * d_model)")

    # mHC (Manifold Hyper-Connection)
    mhc_n_streams: int = Field(default=4, ge=2, le=8, description="Number of residual streams")
    mhc_sinkhorn_iters: int = Field(default=5, ge=1, le=100, description="Sinkhorn-Knopp iterations")

    # Engram (conditional memory)
    engram_n_columns: int = Field(default=4096, ge=256, description="Hash table columns")
    engram_key_dim: int = Field(default=64, ge=16, description="Engram key dimension")
    engram_layer_idx: int = Field(default=1, ge=0, description="Which layer gets engram (0-indexed)")

    # Hestia QAT (disabled Phase 1, skeleton only)
    hestia_enabled: bool = Field(default=False, description="Enable Hestia quantization")
    hestia_bits: float = Field(default=1.58, gt=0, description="Target quantization bits (1.58 = 1.58-bit ternary)")

    # SDR (bypass-only in Phase 1)
    sdr_enabled: bool = Field(default=False, description="Enable stochastic resonance")
    sdr_k: int = Field(default=64, ge=1, description="Top-K sparsification")
    sdr_noise_std: float = Field(default=0.1, ge=0.0, description="SR noise standard deviation")

    # NOTE(review): "FieldValidationInfo" is a string annotation for a name this
    # module does not import; newer pydantic releases call it ValidationInfo.
    # Harmless at runtime, but confirm and import the right name for type checkers.
    @field_validator("n_heads")
    @classmethod
    def validate_heads(cls, v: int, info: "FieldValidationInfo") -> int:
        """Ensure ``n_heads`` equals ``d_model // headdim``.

        Args:
            v: The candidate ``n_heads`` value.
            info: Validation context; ``info.data`` holds the fields validated
                so far.

        Returns:
            ``v`` unchanged when it is consistent.

        Raises:
            ValueError: If ``v`` differs from ``d_model // headdim``.
        """
        validated = info.data
        if "d_model" not in validated or "headdim" not in validated:
            # A missing key means that field failed its own validation and is
            # already being reported. Comparing against fallback defaults here
            # would add a second, misleading error built from values the user
            # never supplied, so skip the cross-check.
            return v

        expected = validated["d_model"] // validated["headdim"]
        if v != expected:
            raise ValueError(
                f"n_heads ({v}) must equal d_model // headdim ({expected})"
            )
        return v

    def estimate_params(self) -> int:
        """Return a rough parameter-count estimate for this configuration.

        The total is the sum of four terms: the per-block count times
        ``n_layer``, two ``vocab_size * d_model`` matrices (embedding and
        lm_head, always counted separately), one engram instance, and the mHC
        mixing matrices.
        """
        inner = self.expand * self.d_model

        # in_proj: d_model -> inner + inner + d_state + d_state + n_heads
        in_proj = self.d_model * (inner + inner + self.d_state + self.d_state + self.n_heads)
        out_proj = inner * self.d_model
        # conv1d (kernel=4, groups=inner_dim)
        conv = inner * 4
        # A_log, lambda_theta, D: n_heads each (3 vectors)
        ssm_params = self.n_heads * 3
        # bc_norm: d_state * 2 (weight + bias)
        bc_norm = self.d_state * 2
        per_block = in_proj + out_proj + conv + ssm_params + bc_norm
        blocks = per_block * self.n_layer

        # Embedding + lm_head, counted as two full matrices.
        embed = self.vocab_size * self.d_model * 2

        # Engram: a single instance, counted as
        # columns * d_model keys + d_model * engram_key_dim projection.
        engram = self.engram_n_columns * self.d_model + self.d_model * self.engram_key_dim

        # mHC mixing matrices: n_layer * mhc_n_streams^2
        mhc = self.n_layer * self.mhc_n_streams ** 2

        return embed + blocks + engram + mhc