Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Community

icarus112 committed about 12 hours ago

Commit

65cd644

verified ·

1 Parent(s): c475135

Update Feather a10g-large training runtime image

Browse files

Files changed (3) hide show

overlay/scripts/hf_boot_smoke.py +130 -0
overlay/scripts/launch_feather_hf_job.py +2 -1
overlay/subsystems/fused_sdr_project.py +32 -0

overlay/scripts/hf_boot_smoke.py CHANGED Viewed

@@ -19,6 +19,7 @@ SAFE_ENV_KEYS = [
     "FEATHER_GPU_PROFILE",
     "FEATHER_HF_FLAVOR",
     "FEATHER_RUNTIME_MODE",
     "HYDRA_RUNTIME_PROFILE",
     "HYDRA_STRICT_OPTIMAL_COMPONENTS",
     "HYDRA_USE_NEMOTRON",
@@ -33,6 +34,11 @@ SAFE_ENV_KEYS = [
     "HYDRA_HTM_FUSED",
     "HYDRA_HTM_BATCHED_FUSED",
     "HYDRA_DISABLE_FUSED_SDR_TRITON",
     "HTM_CUDA_ARCH",
     "TORCH_CUDA_ARCH_LIST",
 ]
@@ -60,6 +66,125 @@ def safe_env_summary() -> dict[str, str]:
     return {key: os.environ[key] for key in SAFE_ENV_KEYS if key in os.environ}
 def main() -> int:
     print("[boot_smoke] phase=start", flush=True)
     ensure_repo_on_path()
@@ -80,6 +205,11 @@ def main() -> int:
         print(f"[boot_smoke] torch_import_failed={type(exc).__name__}: {exc}", flush=True)
         return 2
     try:
         training = importlib.import_module("hydra.training")
         required = ["LATEST_CKPT", "PRETRAIN_FINAL_CKPT", "save_ckpt", "maybe_resume_ckpt"]

     "FEATHER_GPU_PROFILE",
     "FEATHER_HF_FLAVOR",
     "FEATHER_RUNTIME_MODE",
+    "FEATHER_HF_STRICT_RUNTIME_PREFLIGHT",
     "HYDRA_RUNTIME_PROFILE",
     "HYDRA_STRICT_OPTIMAL_COMPONENTS",
     "HYDRA_USE_NEMOTRON",
     "HYDRA_HTM_FUSED",
     "HYDRA_HTM_BATCHED_FUSED",
     "HYDRA_DISABLE_FUSED_SDR_TRITON",
+    "HYDRA_TOKEN_CACHE_GB",
+    "HYDRA_DISABLE_TOKEN_CACHE",
+    "HYDRA_HTM_STRICT_SCALE_FREE",
+    "HYDRA_HTM_REGION_POOL_SIZE",
+    "HYDRA_HTM_CHUNK_B",
     "HTM_CUDA_ARCH",
     "TORCH_CUDA_ARCH_LIST",
 ]
     return {key: os.environ[key] for key in SAFE_ENV_KEYS if key in os.environ}
+def _truthy_env(name: str) -> bool:
+    """Return True when environment variable ``name`` holds a truthy flag value.
+    The value is stripped and lower-cased, then matched against
+    ``{"1", "true", "yes", "on"}``.  An unset variable defaults to ``"0"`` and
+    therefore yields False.
+    """
+    return os.environ.get(name, "0").strip().lower() in {"1", "true", "yes", "on"}
+def strict_optimal_preflight_requested() -> bool:
+    """Return True when any strict-optimal preflight trigger is set in the environment.
+    Triggers, checked in order: a truthy ``FEATHER_HF_STRICT_RUNTIME_PREFLIGHT``,
+    ``HYDRA_STRICT_OPTIMAL_COMPONENTS`` equal to exactly ``"1"``, or
+    ``HYDRA_RUNTIME_PROFILE`` equal to ``optimal-strict`` after stripping and
+    lower-casing.
+    """
+    return (
+        _truthy_env("FEATHER_HF_STRICT_RUNTIME_PREFLIGHT")
+        # NOTE(review): exact "1" comparison, unlike the _truthy_env call on the
+        # previous line; values such as "true" or " 1" do not trigger here --
+        # confirm this matches how the trainer reads the variable.
+        or os.environ.get("HYDRA_STRICT_OPTIMAL_COMPONENTS", "0") == "1"
+        or os.environ.get("HYDRA_RUNTIME_PROFILE", "").strip().lower() == "optimal-strict"
+    )
+def _import_required_module(module_name: str):
+    """Import ``module_name`` and print the outcome with a ``[strict_preflight]`` prefix.
+    Returns the imported module on success.  When the import raises any
+    ``Exception``, the exception type and message are printed and None is
+    returned instead of propagating the error.
+    """
+    try:
+        module = importlib.import_module(module_name)
+    except Exception as exc:
+        # Broad catch: the failure is reported to the caller via the None return.
+        print(f"[strict_preflight] {module_name}=failed {type(exc).__name__}: {exc}", flush=True)
+        return None
+    print(f"[strict_preflight] {module_name}=ok", flush=True)
+    return module
+def run_strict_optimal_preflight() -> int:
+    """Fail before training if the strict-optimal A10G fast path is unavailable.
+    This is intentionally a runtime/image preflight, not a CPU fallback.  It
+    verifies the same strict fast-path surfaces that otherwise fail only after a
+    paid trainer has finished build/provenance setup.
+    Checks run in order: torch import and CUDA device visibility, triton import
+    and active-driver device lookup, ``mamba_ssm.Mamba3``, the
+    ``subsystems.fused_sdr_project.FusedSDRProject`` surface, and the
+    ``htm_rust`` symbols ``HTMRegion``, ``HTMRegionGpu`` and
+    ``step_batch_fused_cuda``.  Every check runs even after an earlier failure.
+    Returns 0 when all checks pass, or 5 after printing the comma-joined
+    failure keys.
+    """
+    print("[strict_preflight] phase=start", flush=True)
+    # Failure keys accumulate here and are reported together at the end.
+    failures: list[str] = []
+    # --- torch: import, then require CUDA with at least one device ---
+    torch = _import_required_module("torch")
+    if torch is None:
+        failures.append("torch_import")
+    else:
+        try:
+            cuda_available = bool(torch.cuda.is_available())
+            # Device count and name are only queried when CUDA reports available.
+            device_count = int(torch.cuda.device_count()) if cuda_available else 0
+            device_name = torch.cuda.get_device_name(0) if cuda_available and device_count else "<none>"
+            if not cuda_available or device_count < 1:
+                failures.append("torch_cuda")
+                print(
+                    f"[strict_preflight] torch_cuda=failed cuda_available={int(cuda_available)} device_count={device_count}",
+                    flush=True,
+                )
+            else:
+                print(
+                    f"[strict_preflight] torch_cuda=ok device_count={device_count} device0={device_name}",
+                    flush=True,
+                )
+        except Exception as exc:
+            # Any exception raised by the CUDA queries counts as a torch_cuda failure.
+            failures.append("torch_cuda")
+            print(f"[strict_preflight] torch_cuda=failed {type(exc).__name__}: {exc}", flush=True)
+    # --- triton: import, then ask the active driver for its current device ---
+    triton = _import_required_module("triton")
+    if triton is None:
+        failures.append("triton_import")
+    else:
+        try:
+            active = triton.runtime.driver.active
+            device = active.get_current_device()
+            print(f"[strict_preflight] triton_driver=ok device={device}", flush=True)
+        except Exception as exc:
+            failures.append("triton_driver")
+            print(f"[strict_preflight] triton_driver=failed {type(exc).__name__}: {exc}", flush=True)
+    # --- mamba_ssm: the module must import and expose a Mamba3 attribute ---
+    mamba = _import_required_module("mamba_ssm")
+    # NOTE(review): when the import itself fails, this branch still prints
+    # "mamba=missing Mamba3" in addition to the import-failure line printed by
+    # _import_required_module -- confirm the doubled message is acceptable.
+    if mamba is None or not hasattr(mamba, "Mamba3"):
+        failures.append("mamba")
+        print("[strict_preflight] mamba=missing Mamba3", flush=True)
+    else:
+        print("[strict_preflight] mamba=ok Mamba3=True", flush=True)
+    # --- fused SDR: first candidate module exposing FusedSDRProject wins ---
+    fused_sdr = None
+    # The candidate tuple currently holds a single module path.
+    for module_name in ("subsystems.fused_sdr_project",):
+        try:
+            module = importlib.import_module(module_name)
+        except Exception as exc:
+            print(f"[strict_preflight] fused_sdr_candidate={module_name} failed {type(exc).__name__}: {exc}", flush=True)
+            continue
+        if hasattr(module, "FusedSDRProject"):
+            fused_sdr = (module_name, module)
+            break
+    if fused_sdr is None:
+        failures.append("fused_sdr")
+        print("[strict_preflight] fused_sdr=missing FusedSDRProject", flush=True)
+    else:
+        module_name, _module = fused_sdr
+        print(f"[strict_preflight] fused_sdr=ok module={module_name}", flush=True)
+    # --- htm_rust: import, then require all three expected attributes ---
+    htm = _import_required_module("htm_rust")
+    if htm is None:
+        failures.append("htm_rust")
+    else:
+        has_region = hasattr(htm, "HTMRegion")
+        has_gpu = hasattr(htm, "HTMRegionGpu")
+        has_fused = hasattr(htm, "step_batch_fused_cuda")
+        if not (has_region and has_gpu and has_fused):
+            failures.append("htm_rust")
+            print(
+                "[strict_preflight] htm_rust=failed "
+                f"HTMRegion={has_region} HTMRegionGpu={has_gpu} step_batch_fused_cuda={has_fused}",
+                flush=True,
+            )
+        else:
+            print(
+                "[strict_preflight] htm_rust=ok "
+                f"HTMRegion={has_region} HTMRegionGpu={has_gpu} step_batch_fused_cuda={has_fused}",
+                flush=True,
+            )
+    # Exit status 5 signals at least one failed check; 0 means all passed.
+    if failures:
+        print(f"[strict_preflight] phase=failed failures={','.join(failures)}", flush=True)
+        return 5
+    print("[strict_preflight] phase=done", flush=True)
+    return 0
 def main() -> int:
     print("[boot_smoke] phase=start", flush=True)
     ensure_repo_on_path()
         print(f"[boot_smoke] torch_import_failed={type(exc).__name__}: {exc}", flush=True)
         return 2
+    if strict_optimal_preflight_requested():
+        rc = run_strict_optimal_preflight()
+        if rc != 0:
+            return rc
     try:
         training = importlib.import_module("hydra.training")
         required = ["LATEST_CKPT", "PRETRAIN_FINAL_CKPT", "save_ckpt", "maybe_resume_ckpt"]

overlay/scripts/launch_feather_hf_job.py CHANGED Viewed

@@ -157,7 +157,7 @@ def build_job_command() -> list[str]:
     override = os.environ.get('FEATHER_HF_JOB_COMMAND')
     if override:
         return shlex.split(override)
-    if _truthy_env('FEATHER_HF_BOOT_SMOKE'):
         return ['python', '/app/scripts/hf_boot_smoke.py']
     if _truthy_env('FEATHER_HF_CHECKPOINT_EVAL'):
         return ['python', '/app/scripts/hf_checkpoint_eval.py']
@@ -527,6 +527,7 @@ def build_dry_run_manifest(
         'receipts_required': {
             'space_stage': 'verify before paid launch',
             'duplicate_active_job_check': '0 active Feather A10G jobs before launch',
             'htm_gpu': 'HTMRegionGpu=True and no CPU fallback for faithful rows',
             'profile_forward': '0 for TPS rows; 1 only for attribution rows',
             'graph_breaks': 'TORCH_LOGS=graph_breaks attached for compile probes',

     override = os.environ.get('FEATHER_HF_JOB_COMMAND')
     if override:
         return shlex.split(override)
+    if _truthy_env('FEATHER_HF_BOOT_SMOKE') or _truthy_env('FEATHER_HF_STRICT_RUNTIME_PREFLIGHT'):
         return ['python', '/app/scripts/hf_boot_smoke.py']
     if _truthy_env('FEATHER_HF_CHECKPOINT_EVAL'):
         return ['python', '/app/scripts/hf_checkpoint_eval.py']
         'receipts_required': {
             'space_stage': 'verify before paid launch',
             'duplicate_active_job_check': '0 active Feather A10G jobs before launch',
+            'strict_runtime_preflight': 'for optimal-strict: run FEATHER_HF_STRICT_RUNTIME_PREFLIGHT=1 and require torch_cuda/triton_driver/mamba/fused_sdr/htm_rust ok before train',
             'htm_gpu': 'HTMRegionGpu=True and no CPU fallback for faithful rows',
             'profile_forward': '0 for TPS rows; 1 only for attribution rows',
             'graph_breaks': 'TORCH_LOGS=graph_breaks attached for compile probes',

overlay/subsystems/fused_sdr_project.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Strict-optimal FusedSDRProject import surface.
+The production Triton implementation lives in ``archive.fused_sdr_project`` after
+PR #31's source reorganization, but strict-optimal HF runtimes still need a
+stable ``subsystems.fused_sdr_project`` module path.  Keep this shim thin so the
+preflight and model path verify the intended fast component without copying the
+kernel body.
+"""
+from __future__ import annotations
+import os
+from archive.fused_sdr_project import FusedSDRProject as _ArchiveFusedSDRProject
+class FusedSDRProject:
+    """Compatibility wrapper that preserves strict-optimal fail-closed guards."""
+    @staticmethod
+    def apply(active_indices, token_ids, sdr_proj_weight, delta_u, delta_v):
+        """Check the strict-optimal guard, then delegate to the archive implementation.
+        All five arguments are forwarded positionally and unchanged to
+        ``_ArchiveFusedSDRProject.apply``; its return value is returned as-is.
+        NOTE(review): argument types and shapes are not visible here -- see
+        ``archive.fused_sdr_project`` for the actual contract.
+        Raises RuntimeError when ``HYDRA_STRICT_OPTIMAL_COMPONENTS`` and
+        ``HYDRA_DISABLE_FUSED_SDR_TRITON`` are both exactly ``"1"``.
+        """
+        # Fail closed: strict-optimal mode combined with the Triton disable flag
+        # is rejected before the archive implementation is called.
+        if (
+            os.environ.get("HYDRA_STRICT_OPTIMAL_COMPONENTS", "0") == "1"
+            and os.environ.get("HYDRA_DISABLE_FUSED_SDR_TRITON", "0") == "1"
+        ):
+            raise RuntimeError(
+                "HYDRA_STRICT_OPTIMAL_COMPONENTS=1 requires FusedSDRProject/Triton; "
+                "HYDRA_DISABLE_FUSED_SDR_TRITON=1 is not allowed."
+            )
+        return _ArchiveFusedSDRProject.apply(active_indices, token_ids, sdr_proj_weight, delta_u, delta_v)
+__all__ = ["FusedSDRProject"]