Spaces:
Runtime error
Runtime error
Update Feather training runtime image
Browse files
overlay/scripts/launch_feather_hf_job.py
CHANGED
|
@@ -94,7 +94,7 @@ def require_token() -> str:
|
|
| 94 |
return TOKEN
|
| 95 |
|
| 96 |
|
| 97 |
-
def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
|
| 98 |
"""Wait until the Space image has been built.
|
| 99 |
|
| 100 |
We use the Space purely as a container-image builder for HF Jobs. The Space
|
|
@@ -109,24 +109,27 @@ def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
|
|
| 109 |
and APP_STARTING_ERROR after a successful BUILDING→APP_STARTING transition
|
| 110 |
are acceptable — the image exists in the registry and Jobs can use it.
|
| 111 |
"""
|
| 112 |
-
start = time.time()
|
| 113 |
-
seen_build_completion = False
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
if
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
| 130 |
# Hard build failures — no image was produced.
|
| 131 |
if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
|
| 132 |
raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
|
|
|
|
| 94 |
return TOKEN
|
| 95 |
|
| 96 |
|
| 97 |
+
def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
|
| 98 |
"""Wait until the Space image has been built.
|
| 99 |
|
| 100 |
We use the Space purely as a container-image builder for HF Jobs. The Space
|
|
|
|
| 109 |
and APP_STARTING_ERROR after a successful BUILDING→APP_STARTING transition
|
| 110 |
are acceptable — the image exists in the registry and Jobs can use it.
|
| 111 |
"""
|
| 112 |
+
start = time.time()
|
| 113 |
+
seen_build_completion = False
|
| 114 |
+
seen_building = False
|
| 115 |
+
while True:
|
| 116 |
+
runtime = api.get_space_runtime(repo_id, token=TOKEN)
|
| 117 |
+
stage = getattr(runtime, 'stage', None)
|
| 118 |
+
hardware = getattr(runtime, 'hardware', None)
|
| 119 |
+
err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
|
| 120 |
+
print(f'[space] stage={stage} hardware={hardware}', flush=True)
|
| 121 |
+
if stage == 'BUILDING':
|
| 122 |
+
seen_building = True
|
| 123 |
+
if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
|
| 124 |
+
seen_build_completion = True
|
| 125 |
+
if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
|
| 126 |
+
return
|
| 127 |
+
# Image is built — Jobs can use it regardless of Space boot outcome.
|
| 128 |
+
if (seen_build_completion or seen_building) and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
|
| 129 |
+
print(f'[space] Space boot failed with {stage} but built image is '
|
| 130 |
+
f'available in the Space registry and is usable by HF Jobs.',
|
| 131 |
+
flush=True)
|
| 132 |
+
return
|
| 133 |
# Hard build failures — no image was produced.
|
| 134 |
if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
|
| 135 |
raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
|
overlay/scripts/run_domain_expanded_pretrain.sh
CHANGED
|
@@ -214,10 +214,14 @@ fi
|
|
| 214 |
|
| 215 |
RETINA_PATH="${HYDRA_RETINA_PATH:-$CACHE_ROOT/retina.npz}"
|
| 216 |
if [[ ! -f "$RETINA_PATH" ]]; then
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
"${
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
fi
|
| 222 |
else
|
| 223 |
log "retina_action=skip path=$RETINA_PATH"
|
|
|
|
| 214 |
|
| 215 |
RETINA_PATH="${HYDRA_RETINA_PATH:-$CACHE_ROOT/retina.npz}"
|
| 216 |
if [[ ! -f "$RETINA_PATH" ]]; then
|
| 217 |
+
if [[ "${HYDRA_ALLOW_SYNTHETIC_RETINA:-0}" == "1" ]]; then
|
| 218 |
+
log "retina_action=skip reason=HYDRA_ALLOW_SYNTHETIC_RETINA=1 and retina missing"
|
| 219 |
+
else
|
| 220 |
+
RETINA_CMD=("${PYTHON_CMD[@]}" -c "from subsystems.sdr_retina import build_retina; build_retina()")
|
| 221 |
+
log "retina_action=build command=${RETINA_CMD[*]}"
|
| 222 |
+
if [[ "$DRY_RUN" -eq 0 ]]; then
|
| 223 |
+
"${RETINA_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
|
| 224 |
+
fi
|
| 225 |
fi
|
| 226 |
else
|
| 227 |
log "retina_action=skip path=$RETINA_PATH"
|
overlay/subsystems/sdr_semantic.py
CHANGED
|
@@ -91,21 +91,39 @@ class SemanticFoldingSDR(nn.Module):
|
|
| 91 |
super().__init__()
|
| 92 |
self.vocab_size = vocab_size
|
| 93 |
self.n_bits = n_bits
|
| 94 |
-
self.som_update_interval = int(som_update_interval)
|
| 95 |
-
self.som_warmup_steps = int(som_warmup_steps)
|
| 96 |
-
self.som_alpha = float(som_alpha)
|
| 97 |
-
|
| 98 |
-
path = retina_path or DEFAULT_RETINA_PATH
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
if retina_sdr.shape != (vocab_size, n_bits):
|
| 111 |
raise ValueError(
|
|
|
|
| 91 |
super().__init__()
|
| 92 |
self.vocab_size = vocab_size
|
| 93 |
self.n_bits = n_bits
|
| 94 |
+
self.som_update_interval = int(som_update_interval)
|
| 95 |
+
self.som_warmup_steps = int(som_warmup_steps)
|
| 96 |
+
self.som_alpha = float(som_alpha)
|
| 97 |
+
|
| 98 |
+
path = retina_path or DEFAULT_RETINA_PATH
|
| 99 |
+
retina_path_exists = Path(path).exists()
|
| 100 |
+
allow_synthetic = os.environ.get("HYDRA_ALLOW_SYNTHETIC_RETINA", "0") == "1"
|
| 101 |
+
|
| 102 |
+
if retina_path_exists:
|
| 103 |
+
with np.load(path) as f:
|
| 104 |
+
retina_sdr = f["sdr"] # bool[V, n_bits]
|
| 105 |
+
stored_vocab = int(f["vocab_size"]) if "vocab_size" in f.files else retina_sdr.shape[0]
|
| 106 |
+
stored_nbits = int(f["n_bits"]) if "n_bits" in f.files else retina_sdr.shape[1]
|
| 107 |
+
stored_target = int(f["target_active"]) if "target_active" in f.files else int(retina_sdr[0].sum())
|
| 108 |
+
elif allow_synthetic:
|
| 109 |
+
synth_target = int(target_active) if target_active is not None else DEFAULT_TARGET_ACTIVE
|
| 110 |
+
print(
|
| 111 |
+
f"[retina] missing {path}; HYDRA_ALLOW_SYNTHETIC_RETINA=1 so using synthetic retina "
|
| 112 |
+
f"(vocab={vocab_size}, n_bits={n_bits}, active={synth_target})",
|
| 113 |
+
flush=True,
|
| 114 |
+
)
|
| 115 |
+
base = np.arange(synth_target, dtype=np.int64)[None, :]
|
| 116 |
+
rows = np.arange(vocab_size, dtype=np.int64)[:, None]
|
| 117 |
+
cols = (rows * 2654435761 + base * 1315423911) % n_bits
|
| 118 |
+
retina_sdr = np.zeros((vocab_size, n_bits), dtype=np.bool_)
|
| 119 |
+
retina_sdr[np.arange(vocab_size)[:, None], cols] = True
|
| 120 |
+
stored_vocab = vocab_size
|
| 121 |
+
stored_nbits = n_bits
|
| 122 |
+
stored_target = synth_target
|
| 123 |
+
else:
|
| 124 |
+
raise FileNotFoundError(
|
| 125 |
+
f"Retina not found at {path}. Run subsystems/sdr_retina.py first."
|
| 126 |
+
)
|
| 127 |
|
| 128 |
if retina_sdr.shape != (vocab_size, n_bits):
|
| 129 |
raise ValueError(
|