Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

icarus112 committed 14 days ago

Commit

2d94172

verified ·

1 Parent(s): 22741d9

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

overlay/configs/__init__.py +5 -0
overlay/configs/hardware_config.py +104 -0
overlay/configs/harness_config.py +108 -0
overlay/configs/model_config.py +80 -0

overlay/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from configs.hardware_config import HardwareConfig
+from configs.harness_config import HarnessConfig
+from configs.model_config import PostSemClawConfig
+# Public API of the configs package: the three config models imported above.
+__all__ = ["PostSemClawConfig", "HarnessConfig", "HardwareConfig"]

overlay/configs/hardware_config.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""Hardware detection and memory budget configuration."""
+from __future__ import annotations
+import torch
+from pydantic import BaseModel, Field
+class HardwareConfig(BaseModel):
+    """Auto-detected hardware configuration with memory budgets."""
+    gpu_name: str = Field(default="unknown", description="GPU device name")
+    gpu_memory_mb: int = Field(default=0, description="Total GPU memory in MB")
+    gpu_vram_mb: int = Field(default=0, description="Alias for gpu_memory_mb (legacy compat)")
+    compute_capability: tuple[int, int] = Field(
+        default=(0, 0), description="CUDA compute capability"
+    )
+    peak_flops: float = Field(
+        default=12.74e12, description="Peak FP32 FLOPS for MFU calculation"
+    )
+    bf16_peak_flops: float = Field(
+        default=38.1e12, description="Peak BF16 FLOPS (RTX 3060 default)"
+    )
+    # Memory budget
+    model_budget_mb: int = Field(
+        default=1500, description="Max MB for model params + optimizer"
+    )
+    activation_budget_mb: int = Field(
+        default=3000, description="Max MB for activations"
+    )
+    overhead_mb: int = Field(
+        default=500, description="Reserved for CUDA context + PyTorch overhead"
+    )
+    max_vram_usage_pct: float = Field(
+        default=90.0, description="Max VRAM usage as % of total"
+    )
+    gradient_checkpointing: bool = Field(
+        default=False, description="Enable gradient checkpointing to save VRAM"
+    )
+    @classmethod
+    def detect(cls) -> HardwareConfig:
+        """Auto-detect hardware from current CUDA device."""
+        if not torch.cuda.is_available():
+            return cls()
+        device = torch.cuda.current_device()
+        props = torch.cuda.get_device_properties(device)
+        cap = (props.major, props.minor)
+        mem_mb = props.total_memory // (1024 * 1024)
+        gpu_name = props.name
+        # Peak FP32 FLOPS lookup by compute capability (approximate)
+        fp32_flops_table: dict[tuple[int, int], float] = {
+            (8, 6): 12.74e12,  # RTX 3060
+            (8, 9): 40.09e12,  # RTX 4090
+            (9, 0): 989.5e12,  # H100 (BF16)
+        }
+        peak = fp32_flops_table.get(cap, 12.74e12)
+        # BF16 peak FLOPS lookup by GPU name substring
+        bf16_flops_table: dict[str, float] = {
+            "3060": 38.1e12,
+            "3090": 71.0e12,
+            "4090": 165.2e12,
+            "A100": 312e12,
+            "H100": 989.5e12,
+            "A10G": 70.0e12,
+        }
+        bf16_peak = 38.1e12  # default to RTX 3060
+        for key, val in bf16_flops_table.items():
+            if key in gpu_name:
+                bf16_peak = val
+                break
+        # Memory budget: leave overhead_mb for CUDA context
+        overhead = 500
+        available = mem_mb - overhead
+        model_budget = int(available * 0.3)      # 30% for params + optimizer
+        activation_budget = int(available * 0.7)  # 70% for activations
+        return cls(
+            gpu_name=gpu_name,
+            gpu_memory_mb=mem_mb,
+            gpu_vram_mb=mem_mb,
+            compute_capability=cap,
+            peak_flops=peak,
+            bf16_peak_flops=bf16_peak,
+            model_budget_mb=model_budget,
+            activation_budget_mb=activation_budget,
+        )
+    def suggest_batch_size(self, d_model: int, seq_len: int, n_layer: int) -> int:
+        """Suggest batch size based on activation budget.
+        Uses rough estimate: per-sample activation ~= n_layer * seq_len * d_model
+        * 4 bytes * 2 (fwd + bwd).
+        """
+        per_sample_mb = n_layer * seq_len * d_model * 4 * 2 / (1024 * 1024)
+        if per_sample_mb <= 0:
+            return 1
+        batch = max(1, int(self.activation_budget_mb / per_sample_mb))
+        # Round down to power of 2
+        return 2 ** (batch.bit_length() - 1) if batch > 1 else 1

overlay/configs/harness_config.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""Harness configuration for HYDRA's self-evolving outer loop."""
+from typing import Literal
+from pydantic import BaseModel, Field
+# Thresholds for one metric, keyed by bound name (the defaults below use "min" and "max").
+GateThresholds = dict[str, float]
+# Metric name -> thresholds for that metric.
+GateConfig = dict[str, GateThresholds]
+class HarnessConfig(BaseModel):
+    """Configuration for the HYDRA harness behavior."""
+    # Inner loop
+    time_budget_seconds: int = Field(
+        default=300, ge=60, description="Training time budget per experiment in seconds"
+    )
+    max_experiments: int = Field(
+        default=1000, ge=0, description="Max experiments before stopping (0=infinite)"
+    )
+    # Meta-agent
+    meta_interval: int = Field(
+        default=20, ge=5, description="Run meta-agent every N experiments"
+    )
+    max_meta_changes: int = Field(
+        default=3, ge=1, le=10, description="Max changes per meta-iteration"
+    )
+    # Search strategy
+    exploration_mode: Literal["conservative", "balanced", "bold"] = "balanced"
+    exploration_budget: int = Field(
+        default=5, ge=1, description="Consecutive bold experiments when stuck"
+    )
+    stuck_threshold: int = Field(
+        default=10, ge=3, description="No improvement for N experiments = stuck"
+    )
+    crash_threshold: float = Field(
+        default=0.5,
+        ge=0.1,
+        le=1.0,
+        description="Crash rate threshold for BROKEN state",
+    )
+    regression_tolerance: float = Field(
+        default=0.05,
+        ge=0,
+        le=0.2,
+        description="Max val_bpb regression from best (fraction)",
+    )
+    max_regression_pct: float = Field(
+        default=5.0, description="Max % regression from best known val_bpb"
+    )
+    # Keep/discard criteria
+    primary_metric: str = "val_bpb"
+    secondary_metrics: GateConfig = Field(
+        default_factory=lambda: {
+            "mhc_spectral_norm": {"max": 2.0},
+            "engram_hit_rate": {"min": 0.1},
+            "factual_english_score": {"min": 0.5},
+            "instruction_following_score": {"min": 0.5},
+            "distinct_2": {"min": 0.1},
+            "repetition_rate": {"max": 0.2},
+            "hestia_quant_error": {"max": 0.05},
+        }
+    )
+    # Experiment execution
+    experiment_timeout: int = Field(
+        default=600, ge=300, description="Kill experiment after N seconds"
+    )
+    warmup_steps: int = Field(
+        default=10, ge=0, description="Steps to exclude from timing"
+    )
+    # Git
+    branch_prefix: str = Field(default="autoresearch", description="Branch naming prefix")
+    results_file: str = Field(default="results.tsv", description="Experiment log file")
+    # Secondary metric gates (optional keep/discard criteria)
+    gate_mhc_spectral_norm: float | None = Field(
+        default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
+    )
+    gate_engram_hit_rate: float | None = Field(
+        default=None, description="Min engram_hit_rate for keep (None=disabled)"
+    )
+    gate_tps_median: float | None = Field(
+        default=None,
+        description="Min steady-state tps_median for keep (None=disabled)",
+    )
+    gate_tps_p10: float | None = Field(
+        default=None,
+        description="Min steady-state tps_p10 for keep (None=disabled)",
+    )
+    def to_secondary_gates(self) -> GateConfig:
+        """Build active keep/discard gates from defaults plus gate_* overrides."""
+        gates = {metric: thresholds.copy() for metric, thresholds in self.secondary_metrics.items()}
+        if self.gate_mhc_spectral_norm is not None:
+            gates.setdefault("mhc_spectral_norm", {})["max"] = self.gate_mhc_spectral_norm
+        if self.gate_engram_hit_rate is not None:
+            gates.setdefault("engram_hit_rate", {})["min"] = self.gate_engram_hit_rate
+        if self.gate_tps_median is not None:
+            gates.setdefault("tps_median", {})["min"] = self.gate_tps_median
+        if self.gate_tps_p10 is not None:
+            gates.setdefault("tps_p10", {})["min"] = self.gate_tps_p10
+        return gates

overlay/configs/model_config.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""Post-SEM-Claw model configuration with Pydantic validation."""
+from pydantic import BaseModel, Field, ValidationInfo, field_validator
+class PostSemClawConfig(BaseModel):
+    """Configuration for the Post-SEM-Claw architecture.
+    Default values mirror the @dataclass in train.py exactly.
+    train.py is the source of truth — this file must stay in sync with it.
+    """
+    # Sequence
+    sequence_len: int = Field(default=2048, description="Context length (from prepare.py MAX_SEQ_LEN)")
+    vocab_size: int = Field(default=8192, description="Vocabulary size (from prepare.py VOCAB_SIZE)")
+    # Mamba-3 SSM
+    n_layer: int = Field(default=4, ge=1, le=48, description="Number of Mamba-3 blocks")
+    d_model: int = Field(default=256, ge=64, description="Model embedding dimension")
+    d_state: int = Field(default=64, ge=16, description="SSM state dimension")
+    headdim: int = Field(default=32, ge=16, description="SSM head dimension")
+    n_heads: int = Field(default=8, ge=1, description="Number of SSM heads (d_model // headdim)")
+    expand: int = Field(default=2, ge=1, le=4, description="Inner dim multiplier (inner_dim = expand * d_model)")
+    # mHC (Manifold Hyper-Connection)
+    mhc_n_streams: int = Field(default=4, ge=2, le=8, description="Number of residual streams")
+    mhc_sinkhorn_iters: int = Field(default=5, ge=1, le=100, description="Sinkhorn-Knopp iterations")
+    # Engram (conditional memory)
+    engram_n_columns: int = Field(default=4096, ge=256, description="Hash table columns")
+    engram_key_dim: int = Field(default=64, ge=16, description="Engram key dimension")
+    engram_layer_idx: int = Field(default=1, ge=0, description="Which layer gets engram (0-indexed)")
+    # Hestia QAT (disabled Phase 1, skeleton only)
+    hestia_enabled: bool = Field(default=False, description="Enable Hestia quantization")
+    hestia_bits: float = Field(default=1.58, gt=0, description="Target quantization bits (1.58 = 1.58-bit ternary)")
+    # SDR (bypass-only in Phase 1)
+    sdr_enabled: bool = Field(default=False, description="Enable stochastic resonance")
+    sdr_k: int = Field(default=64, ge=1, description="Top-K sparsification")
+    sdr_noise_std: float = Field(default=0.1, ge=0.0, description="SR noise standard deviation")
+    @field_validator("n_heads")
+    @classmethod
+    def validate_heads(cls, v: int, info: "FieldValidationInfo") -> int:
+        """Ensure n_heads equals d_model // headdim."""
+        d_model = info.data.get("d_model", 256)
+        headdim = info.data.get("headdim", 32)
+        expected = d_model // headdim
+        if v != expected:
+            raise ValueError(
+                f"n_heads ({v}) must equal d_model // headdim ({expected})"
+            )
+        return v
+    def estimate_params(self) -> int:
+        """Rough parameter count estimate based on train.py architecture."""
+        inner = self.expand * self.d_model
+        # in_proj: d_model -> inner + inner + d_state + d_state + n_heads
+        in_proj = self.d_model * (inner + inner + self.d_state + self.d_state + self.n_heads)
+        out_proj = inner * self.d_model
+        # conv1d (kernel=4, groups=inner_dim)
+        conv = inner * 4
+        # A_log, lambda_theta, D: n_heads each (3 vectors)
+        ssm_params = self.n_heads * 3
+        # bc_norm: d_state * 2 (weight + bias)
+        bc_norm = self.d_state * 2
+        per_block = in_proj + out_proj + conv + ssm_params + bc_norm
+        blocks = per_block * self.n_layer
+        # Embedding + lm_head (tied or untied)
+        embed = self.vocab_size * self.d_model * 2
+        # Engram: one instance at engram_layer_idx
+        # columns * d_model keys + d_model * engram_key_dim projection
+        engram = self.engram_n_columns * self.d_model + self.d_model * self.engram_key_dim
+        # mHC mixing matrices: n_layer * mhc_n_streams^2
+        mhc = self.n_layer * self.mhc_n_streams ** 2
+        return embed + blocks + engram + mhc