Jackoatmon committed on
Commit
49f6ada
·
verified ·
1 Parent(s): 0425b2b

Update Feather training runtime image

Browse files
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -94,7 +94,7 @@ def require_token() -> str:
94
  return TOKEN
95
 
96
 
97
- def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
98
  """Wait until the Space image has been built.
99
 
100
  We use the Space purely as a container-image builder for HF Jobs. The Space
@@ -109,24 +109,27 @@ def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
109
  and APP_STARTING_ERROR after a successful BUILDING→APP_STARTING transition
110
  are acceptable — the image exists in the registry and Jobs can use it.
111
  """
112
- start = time.time()
113
- seen_build_completion = False
114
- while True:
115
- runtime = api.get_space_runtime(repo_id, token=TOKEN)
116
- stage = getattr(runtime, 'stage', None)
117
- hardware = getattr(runtime, 'hardware', None)
118
- err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
119
- print(f'[space] stage={stage} hardware={hardware}', flush=True)
120
- if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
121
- seen_build_completion = True
122
- if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
123
- return
124
- # Image is built — Jobs can use it regardless of Space boot outcome.
125
- if seen_build_completion and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
126
- print(f'[space] Space boot failed with {stage} but built image is '
127
- f'available in the Space registry and is usable by HF Jobs.',
128
- flush=True)
129
- return
 
 
 
130
  # Hard build failures — no image was produced.
131
  if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
132
  raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
 
94
  return TOKEN
95
 
96
 
97
+ def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
98
  """Wait until the Space image has been built.
99
 
100
  We use the Space purely as a container-image builder for HF Jobs. The Space
 
109
  and APP_STARTING_ERROR after a successful BUILDING→APP_STARTING transition
110
  are acceptable — the image exists in the registry and Jobs can use it.
111
  """
112
+ start = time.time()
113
+ seen_build_completion = False
114
+ seen_building = False
115
+ while True:
116
+ runtime = api.get_space_runtime(repo_id, token=TOKEN)
117
+ stage = getattr(runtime, 'stage', None)
118
+ hardware = getattr(runtime, 'hardware', None)
119
+ err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
120
+ print(f'[space] stage={stage} hardware={hardware}', flush=True)
121
+ if stage == 'BUILDING':
122
+ seen_building = True
123
+ if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
124
+ seen_build_completion = True
125
+ if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
126
+ return
127
+ # Image is built — Jobs can use it regardless of Space boot outcome.
128
+ if (seen_build_completion or seen_building) and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
129
+ print(f'[space] Space boot failed with {stage} but built image is '
130
+ f'available in the Space registry and is usable by HF Jobs.',
131
+ flush=True)
132
+ return
133
  # Hard build failures — no image was produced.
134
  if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
135
  raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
overlay/scripts/run_domain_expanded_pretrain.sh CHANGED
@@ -214,10 +214,14 @@ fi
214
 
215
  RETINA_PATH="${HYDRA_RETINA_PATH:-$CACHE_ROOT/retina.npz}"
216
  if [[ ! -f "$RETINA_PATH" ]]; then
217
- RETINA_CMD=("${PYTHON_CMD[@]}" -c "from subsystems.sdr_retina import build_retina; build_retina()")
218
- log "retina_action=build command=${RETINA_CMD[*]}"
219
- if [[ "$DRY_RUN" -eq 0 ]]; then
220
- "${RETINA_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
 
 
 
 
221
  fi
222
  else
223
  log "retina_action=skip path=$RETINA_PATH"
 
214
 
215
  RETINA_PATH="${HYDRA_RETINA_PATH:-$CACHE_ROOT/retina.npz}"
216
  if [[ ! -f "$RETINA_PATH" ]]; then
217
+ if [[ "${HYDRA_ALLOW_SYNTHETIC_RETINA:-0}" == "1" ]]; then
218
+ log "retina_action=skip reason=HYDRA_ALLOW_SYNTHETIC_RETINA=1 and retina missing"
219
+ else
220
+ RETINA_CMD=("${PYTHON_CMD[@]}" -c "from subsystems.sdr_retina import build_retina; build_retina()")
221
+ log "retina_action=build command=${RETINA_CMD[*]}"
222
+ if [[ "$DRY_RUN" -eq 0 ]]; then
223
+ "${RETINA_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
224
+ fi
225
  fi
226
  else
227
  log "retina_action=skip path=$RETINA_PATH"
overlay/subsystems/sdr_semantic.py CHANGED
@@ -91,21 +91,39 @@ class SemanticFoldingSDR(nn.Module):
91
  super().__init__()
92
  self.vocab_size = vocab_size
93
  self.n_bits = n_bits
94
- self.som_update_interval = int(som_update_interval)
95
- self.som_warmup_steps = int(som_warmup_steps)
96
- self.som_alpha = float(som_alpha)
97
-
98
- path = retina_path or DEFAULT_RETINA_PATH
99
- if not Path(path).exists():
100
- raise FileNotFoundError(
101
- f"Retina not found at {path}. Run subsystems/sdr_retina.py first."
102
- )
103
-
104
- with np.load(path) as f:
105
- retina_sdr = f["sdr"] # bool[V, n_bits]
106
- stored_vocab = int(f["vocab_size"]) if "vocab_size" in f.files else retina_sdr.shape[0]
107
- stored_nbits = int(f["n_bits"]) if "n_bits" in f.files else retina_sdr.shape[1]
108
- stored_target = int(f["target_active"]) if "target_active" in f.files else int(retina_sdr[0].sum())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  if retina_sdr.shape != (vocab_size, n_bits):
111
  raise ValueError(
 
91
  super().__init__()
92
  self.vocab_size = vocab_size
93
  self.n_bits = n_bits
94
+ self.som_update_interval = int(som_update_interval)
95
+ self.som_warmup_steps = int(som_warmup_steps)
96
+ self.som_alpha = float(som_alpha)
97
+
98
+ path = retina_path or DEFAULT_RETINA_PATH
99
+ retina_path_exists = Path(path).exists()
100
+ allow_synthetic = os.environ.get("HYDRA_ALLOW_SYNTHETIC_RETINA", "0") == "1"
101
+
102
+ if retina_path_exists:
103
+ with np.load(path) as f:
104
+ retina_sdr = f["sdr"] # bool[V, n_bits]
105
+ stored_vocab = int(f["vocab_size"]) if "vocab_size" in f.files else retina_sdr.shape[0]
106
+ stored_nbits = int(f["n_bits"]) if "n_bits" in f.files else retina_sdr.shape[1]
107
+ stored_target = int(f["target_active"]) if "target_active" in f.files else int(retina_sdr[0].sum())
108
+ elif allow_synthetic:
109
+ synth_target = int(target_active) if target_active is not None else DEFAULT_TARGET_ACTIVE
110
+ print(
111
+ f"[retina] missing {path}; HYDRA_ALLOW_SYNTHETIC_RETINA=1 so using synthetic retina "
112
+ f"(vocab={vocab_size}, n_bits={n_bits}, active={synth_target})",
113
+ flush=True,
114
+ )
115
+ base = np.arange(synth_target, dtype=np.int64)[None, :]
116
+ rows = np.arange(vocab_size, dtype=np.int64)[:, None]
117
+ cols = (rows * 2654435761 + base * 1315423911) % n_bits
118
+ retina_sdr = np.zeros((vocab_size, n_bits), dtype=np.bool_)
119
+ retina_sdr[np.arange(vocab_size)[:, None], cols] = True
120
+ stored_vocab = vocab_size
121
+ stored_nbits = n_bits
122
+ stored_target = synth_target
123
+ else:
124
+ raise FileNotFoundError(
125
+ f"Retina not found at {path}. Run subsystems/sdr_retina.py first."
126
+ )
127
 
128
  if retina_sdr.shape != (vocab_size, n_bits):
129
  raise ValueError(