icarus112 committed on
Commit
65cd644
·
verified ·
1 Parent(s): c475135

Update Feather a10g-large training runtime image

Browse files
overlay/scripts/hf_boot_smoke.py CHANGED
@@ -19,6 +19,7 @@ SAFE_ENV_KEYS = [
19
  "FEATHER_GPU_PROFILE",
20
  "FEATHER_HF_FLAVOR",
21
  "FEATHER_RUNTIME_MODE",
 
22
  "HYDRA_RUNTIME_PROFILE",
23
  "HYDRA_STRICT_OPTIMAL_COMPONENTS",
24
  "HYDRA_USE_NEMOTRON",
@@ -33,6 +34,11 @@ SAFE_ENV_KEYS = [
33
  "HYDRA_HTM_FUSED",
34
  "HYDRA_HTM_BATCHED_FUSED",
35
  "HYDRA_DISABLE_FUSED_SDR_TRITON",
 
 
 
 
 
36
  "HTM_CUDA_ARCH",
37
  "TORCH_CUDA_ARCH_LIST",
38
  ]
@@ -60,6 +66,125 @@ def safe_env_summary() -> dict[str, str]:
60
  return {key: os.environ[key] for key in SAFE_ENV_KEYS if key in os.environ}
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def main() -> int:
64
  print("[boot_smoke] phase=start", flush=True)
65
  ensure_repo_on_path()
@@ -80,6 +205,11 @@ def main() -> int:
80
  print(f"[boot_smoke] torch_import_failed={type(exc).__name__}: {exc}", flush=True)
81
  return 2
82
 
 
 
 
 
 
83
  try:
84
  training = importlib.import_module("hydra.training")
85
  required = ["LATEST_CKPT", "PRETRAIN_FINAL_CKPT", "save_ckpt", "maybe_resume_ckpt"]
 
19
  "FEATHER_GPU_PROFILE",
20
  "FEATHER_HF_FLAVOR",
21
  "FEATHER_RUNTIME_MODE",
22
+ "FEATHER_HF_STRICT_RUNTIME_PREFLIGHT",
23
  "HYDRA_RUNTIME_PROFILE",
24
  "HYDRA_STRICT_OPTIMAL_COMPONENTS",
25
  "HYDRA_USE_NEMOTRON",
 
34
  "HYDRA_HTM_FUSED",
35
  "HYDRA_HTM_BATCHED_FUSED",
36
  "HYDRA_DISABLE_FUSED_SDR_TRITON",
37
+ "HYDRA_TOKEN_CACHE_GB",
38
+ "HYDRA_DISABLE_TOKEN_CACHE",
39
+ "HYDRA_HTM_STRICT_SCALE_FREE",
40
+ "HYDRA_HTM_REGION_POOL_SIZE",
41
+ "HYDRA_HTM_CHUNK_B",
42
  "HTM_CUDA_ARCH",
43
  "TORCH_CUDA_ARCH_LIST",
44
  ]
 
66
  return {key: os.environ[key] for key in SAFE_ENV_KEYS if key in os.environ}
67
 
68
 
69
def _truthy_env(name: str) -> bool:
    """Report whether environment variable *name* holds an affirmative flag.

    An unset variable is treated as ``"0"``. The value is stripped of
    surrounding whitespace and lower-cased before it is compared against the
    accepted spellings ``1``, ``true``, ``yes`` and ``on``.
    """
    raw_value = os.environ.get(name, "0")
    normalized = raw_value.strip().lower()
    return normalized in {"1", "true", "yes", "on"}
71
+
72
+
73
def strict_optimal_preflight_requested() -> bool:
    """Decide whether the strict-optimal runtime preflight should run.

    Any one of these environment signals enables it:

    * ``FEATHER_HF_STRICT_RUNTIME_PREFLIGHT`` is truthy per ``_truthy_env``;
    * ``HYDRA_STRICT_OPTIMAL_COMPONENTS`` is exactly ``"1"``;
    * ``HYDRA_RUNTIME_PROFILE`` equals ``optimal-strict`` after stripping
      whitespace and lower-casing.
    """
    if _truthy_env("FEATHER_HF_STRICT_RUNTIME_PREFLIGHT"):
        return True
    # This flag is matched literally against "1", not via the truthy parser.
    if os.environ.get("HYDRA_STRICT_OPTIMAL_COMPONENTS", "0") == "1":
        return True
    runtime_profile = os.environ.get("HYDRA_RUNTIME_PROFILE", "").strip().lower()
    return runtime_profile == "optimal-strict"
79
+
80
+
81
def _import_required_module(module_name: str):
    """Import *module_name* and print a ``[strict_preflight]`` status line.

    Returns the imported module on success. If the import raises any
    ``Exception``, the exception type and message are printed and ``None`` is
    returned instead of propagating the error.
    """
    try:
        imported = importlib.import_module(module_name)
    except Exception as exc:
        # Broad on purpose: any import-time failure is reported, not raised.
        print(f"[strict_preflight] {module_name}=failed {type(exc).__name__}: {exc}", flush=True)
        return None
    else:
        print(f"[strict_preflight] {module_name}=ok", flush=True)
        return imported
89
+
90
+
91
def run_strict_optimal_preflight() -> int:
    """Fail before training if the strict-optimal A10G fast path is unavailable.

    This is intentionally a runtime/image preflight, not a CPU fallback. It
    verifies the same strict fast-path surfaces that otherwise fail only after a
    paid trainer has finished build/provenance setup.

    The checks run in a fixed order (torch/CUDA, Triton driver, mamba_ssm,
    fused SDR projection, htm_rust). Every check always runs, so a single
    invocation reports all missing components rather than only the first.

    Returns:
        ``0`` when every check passes, ``5`` when at least one check fails.
        The failing check tags are printed comma-joined on the
        ``phase=failed`` line.
    """
    print("[strict_preflight] phase=start", flush=True)
    checks = (
        _preflight_torch_cuda,
        _preflight_triton_driver,
        _preflight_mamba,
        _preflight_fused_sdr,
        _preflight_htm_rust,
    )
    failures: list[str] = []
    for check in checks:
        # Each check returns its failure tags; an empty list means it passed.
        failures.extend(check())

    if failures:
        print(f"[strict_preflight] phase=failed failures={','.join(failures)}", flush=True)
        return 5
    print("[strict_preflight] phase=done", flush=True)
    return 0


def _preflight_torch_cuda() -> list[str]:
    """Check that torch imports and reports at least one CUDA device."""
    torch = _import_required_module("torch")
    if torch is None:
        return ["torch_import"]
    try:
        cuda_available = bool(torch.cuda.is_available())
        # Only query device details once CUDA is reported as available.
        device_count = int(torch.cuda.device_count()) if cuda_available else 0
        device_name = torch.cuda.get_device_name(0) if cuda_available and device_count else "<none>"
    except Exception as exc:
        print(f"[strict_preflight] torch_cuda=failed {type(exc).__name__}: {exc}", flush=True)
        return ["torch_cuda"]
    if not cuda_available or device_count < 1:
        print(
            f"[strict_preflight] torch_cuda=failed cuda_available={int(cuda_available)} device_count={device_count}",
            flush=True,
        )
        return ["torch_cuda"]
    print(
        f"[strict_preflight] torch_cuda=ok device_count={device_count} device0={device_name}",
        flush=True,
    )
    return []


def _preflight_triton_driver() -> list[str]:
    """Check that triton imports and its active driver reports a current device."""
    triton = _import_required_module("triton")
    if triton is None:
        return ["triton_import"]
    try:
        device = triton.runtime.driver.active.get_current_device()
    except Exception as exc:
        print(f"[strict_preflight] triton_driver=failed {type(exc).__name__}: {exc}", flush=True)
        return ["triton_driver"]
    print(f"[strict_preflight] triton_driver=ok device={device}", flush=True)
    return []


def _preflight_mamba() -> list[str]:
    """Check that ``mamba_ssm`` imports and exposes a ``Mamba3`` attribute."""
    mamba = _import_required_module("mamba_ssm")
    # An import failure and a missing attribute share one tag and message;
    # the import failure itself was already logged by the import helper.
    if mamba is None or not hasattr(mamba, "Mamba3"):
        print("[strict_preflight] mamba=missing Mamba3", flush=True)
        return ["mamba"]
    print("[strict_preflight] mamba=ok Mamba3=True", flush=True)
    return []


def _preflight_fused_sdr() -> list[str]:
    """Check that the fused SDR module imports and exposes ``FusedSDRProject``."""
    module_name = "subsystems.fused_sdr_project"
    try:
        module = importlib.import_module(module_name)
    except Exception as exc:
        print(f"[strict_preflight] fused_sdr_candidate={module_name} failed {type(exc).__name__}: {exc}", flush=True)
        module = None
    if module is not None and hasattr(module, "FusedSDRProject"):
        print(f"[strict_preflight] fused_sdr=ok module={module_name}", flush=True)
        return []
    print("[strict_preflight] fused_sdr=missing FusedSDRProject", flush=True)
    return ["fused_sdr"]


def _preflight_htm_rust() -> list[str]:
    """Check that ``htm_rust`` imports and exposes the CPU, GPU and fused entry points."""
    htm = _import_required_module("htm_rust")
    if htm is None:
        return ["htm_rust"]
    has_region = hasattr(htm, "HTMRegion")
    has_gpu = hasattr(htm, "HTMRegionGpu")
    has_fused = hasattr(htm, "step_batch_fused_cuda")
    if not (has_region and has_gpu and has_fused):
        print(
            "[strict_preflight] htm_rust=failed "
            f"HTMRegion={has_region} HTMRegionGpu={has_gpu} step_batch_fused_cuda={has_fused}",
            flush=True,
        )
        return ["htm_rust"]
    print(
        "[strict_preflight] htm_rust=ok "
        f"HTMRegion={has_region} HTMRegionGpu={has_gpu} step_batch_fused_cuda={has_fused}",
        flush=True,
    )
    return []
186
+
187
+
188
  def main() -> int:
189
  print("[boot_smoke] phase=start", flush=True)
190
  ensure_repo_on_path()
 
205
  print(f"[boot_smoke] torch_import_failed={type(exc).__name__}: {exc}", flush=True)
206
  return 2
207
 
208
+ if strict_optimal_preflight_requested():
209
+ rc = run_strict_optimal_preflight()
210
+ if rc != 0:
211
+ return rc
212
+
213
  try:
214
  training = importlib.import_module("hydra.training")
215
  required = ["LATEST_CKPT", "PRETRAIN_FINAL_CKPT", "save_ckpt", "maybe_resume_ckpt"]
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -157,7 +157,7 @@ def build_job_command() -> list[str]:
157
  override = os.environ.get('FEATHER_HF_JOB_COMMAND')
158
  if override:
159
  return shlex.split(override)
160
- if _truthy_env('FEATHER_HF_BOOT_SMOKE'):
161
  return ['python', '/app/scripts/hf_boot_smoke.py']
162
  if _truthy_env('FEATHER_HF_CHECKPOINT_EVAL'):
163
  return ['python', '/app/scripts/hf_checkpoint_eval.py']
@@ -527,6 +527,7 @@ def build_dry_run_manifest(
527
  'receipts_required': {
528
  'space_stage': 'verify before paid launch',
529
  'duplicate_active_job_check': '0 active Feather A10G jobs before launch',
 
530
  'htm_gpu': 'HTMRegionGpu=True and no CPU fallback for faithful rows',
531
  'profile_forward': '0 for TPS rows; 1 only for attribution rows',
532
  'graph_breaks': 'TORCH_LOGS=graph_breaks attached for compile probes',
 
157
  override = os.environ.get('FEATHER_HF_JOB_COMMAND')
158
  if override:
159
  return shlex.split(override)
160
+ if _truthy_env('FEATHER_HF_BOOT_SMOKE') or _truthy_env('FEATHER_HF_STRICT_RUNTIME_PREFLIGHT'):
161
  return ['python', '/app/scripts/hf_boot_smoke.py']
162
  if _truthy_env('FEATHER_HF_CHECKPOINT_EVAL'):
163
  return ['python', '/app/scripts/hf_checkpoint_eval.py']
 
527
  'receipts_required': {
528
  'space_stage': 'verify before paid launch',
529
  'duplicate_active_job_check': '0 active Feather A10G jobs before launch',
530
+ 'strict_runtime_preflight': 'for optimal-strict: run FEATHER_HF_STRICT_RUNTIME_PREFLIGHT=1 and require torch_cuda/triton_driver/mamba/fused_sdr/htm_rust ok before train',
531
  'htm_gpu': 'HTMRegionGpu=True and no CPU fallback for faithful rows',
532
  'profile_forward': '0 for TPS rows; 1 only for attribution rows',
533
  'graph_breaks': 'TORCH_LOGS=graph_breaks attached for compile probes',
overlay/subsystems/fused_sdr_project.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Strict-optimal FusedSDRProject import surface.
2
+
3
+ The production Triton implementation lives in ``archive.fused_sdr_project`` after
4
+ PR #31's source reorganization, but strict-optimal HF runtimes still need a
5
+ stable ``subsystems.fused_sdr_project`` module path. Keep this shim thin so the
6
+ preflight and model path verify the intended fast component without copying the
7
+ kernel body.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import os
12
+
13
+ from archive.fused_sdr_project import FusedSDRProject as _ArchiveFusedSDRProject
14
+
15
+
16
class FusedSDRProject:
    """Thin stand-in for the archived implementation that keeps the strict-mode guard.

    Exposes the same ``apply`` entry point and forwards to the archive class
    unless the environment asks for strict-optimal components while also
    disabling the fused SDR Triton path.
    """

    @staticmethod
    def apply(active_indices, token_ids, sdr_proj_weight, delta_u, delta_v):
        """Forward to the archived ``FusedSDRProject.apply`` after the guard check.

        Raises:
            RuntimeError: if ``HYDRA_STRICT_OPTIMAL_COMPONENTS`` and
                ``HYDRA_DISABLE_FUSED_SDR_TRITON`` are both exactly ``"1"``.
        """
        strict_requested = os.environ.get("HYDRA_STRICT_OPTIMAL_COMPONENTS", "0") == "1"
        triton_disabled = os.environ.get("HYDRA_DISABLE_FUSED_SDR_TRITON", "0") == "1"
        # Fail closed: strict mode may not run with the Triton path disabled.
        if strict_requested and triton_disabled:
            raise RuntimeError(
                "HYDRA_STRICT_OPTIMAL_COMPONENTS=1 requires FusedSDRProject/Triton; "
                "HYDRA_DISABLE_FUSED_SDR_TRITON=1 is not allowed."
            )
        return _ArchiveFusedSDRProject.apply(
            active_indices,
            token_ids,
            sdr_proj_weight,
            delta_u,
            delta_v,
        )
30
+
31
+
32
+ __all__ = ["FusedSDRProject"]