"""Submit a Hugging Face job that hot-patches the Feather runtime and starts it.

The job command first executes ``hotpatch_script`` inside the container (the
script is shipped base64-encoded so it can be embedded in a single-quoted
``bash -c`` string) and then runs ``/app/entrypoint.py``.

The Hugging Face token is read from ``HF_TOKEN`` or, failing that, from
``~/.cache/huggingface/token``. It is used both to authenticate the request
and as the job's ``HF_TOKEN`` secret.
"""

import base64
import os

namespace = "GAInTech"
space_id = "GAInTech/feather-a10g-gt80k-runtime-public"

# Upper bound (seconds) on the job-submission request so a stalled connection
# cannot hang the launcher forever.
REQUEST_TIMEOUT_S = 60

# Script executed inside the job container before the entrypoint. It rewrites
# source files of a stale runtime image in place. Each ``*_text`` helper is a
# pure ``str -> str`` transform; the driver at the bottom applies them to the
# files that exist. Statement-level patches capture the indentation of the line
# they match instead of hard-coding it, so they work whatever indent width the
# target file uses and always emit consistently indented code.
hotpatch_script = r'''
import os
import re
from pathlib import Path

# (name, definition) pairs appended to config.py when the name is not assigned.
CONFIG_DEFAULTS = (
    ('CKPT_INTERVAL', 'CKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))'),
    ('CKPT_ROTATIONS', 'CKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))'),
    ('RESUME_CKPT', 'RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none"))'),
)

# Anchor lines in training.py next to which the fallback globals are inserted.
TIME_BUDGET_LINE = 'TIME_BUDGET = int(os.environ.get("HYDRA_TIME_BUDGET", str(_TIME_BUDGET)))'
SCHEDULES_HEADER = '# ---------------------------------------------------------------------------\n# Schedules'

# Fallback globals inserted right after TIME_BUDGET_LINE (values imported from
# config, if any, win over the literal defaults).
GLOBALS_AFTER_TIME_BUDGET = (
    '\nCACHE_DIR = Path(os.environ.get("HYDRA_CACHE_DIR", str(Path.home() / ".cache" / "autoresearch")))'
    '\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", str(globals().get("CKPT_INTERVAL", 1000))))'
    '\nCKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", str(globals().get("CKPT_ROTATIONS", 3))))'
    '\nRESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", str(globals().get("RESUME_CKPT", "none"))))'
    '\n_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None'
)

# Fallback globals inserted in front of SCHEDULES_HEADER when TIME_BUDGET_LINE
# is absent.
GLOBALS_BEFORE_SCHEDULES = (
    'CACHE_DIR = Path(os.environ.get("HYDRA_CACHE_DIR", str(Path.home() / ".cache" / "autoresearch")))'
    '\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))'
    '\nCKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))'
    '\nRESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none"))'
    '\n_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None\n\n'
)

# Stand-in region class spliced into htm.py after ``import htm_rust``.
HTM_STUB = (
    "\nclass _StubRegion:"
    "\n def __init__(self, *a, **k): self.n_columns=2048"
    "\n def step(self, *a, **k): import numpy as np; return (np.zeros(2048), None, None, 1.0)"
    "\n def step_many(self, sdr, *a): import numpy as np; T=sdr.shape[0]; return (np.zeros((T,2048)), np.ones(T, dtype=np.float32))"
    "\n def reset(self): pass\n"
)


def patch(path, old, new, label=None):
    """Replace every occurrence of ``old`` with ``new`` in the file at ``path``.

    Returns True when the file was rewritten, False when the file is missing
    or does not contain ``old``. The outcome is printed either way.
    """
    p = Path(path)
    if not p.exists():
        print(f'[hotpatch] skip missing {path}')
        return False
    s = p.read_text()
    if old in s:
        p.write_text(s.replace(old, new))
        print(f'[hotpatch] patched {label or path}')
        return True
    print(f'[hotpatch] no-match {label or path}')
    return False


def apply_to_file(path, transform, message):
    """Rewrite ``path`` with ``transform(text)`` and print ``message``.

    A missing file is skipped silently. Returns True when the file was written.
    """
    p = Path(path)
    if not p.exists():
        return False
    p.write_text(transform(p.read_text()))
    print(message)
    return True


def ensure_config_constants(s):
    """Append the checkpoint constants that config.py text ``s`` does not assign."""
    missing = [line for name, line in CONFIG_DEFAULTS if f'{name} =' not in s]
    if not missing:
        return s
    # The appended definitions call os.environ, so make sure ``os`` is bound.
    if re.search(r'^[ \t]*import os\b', s, flags=re.M) is None:
        missing.insert(0, 'import os')
    # Never glue the first appended line onto an unterminated last line.
    if s and not s.endswith('\n'):
        s += '\n'
    return s + '\n' + '\n'.join(missing) + '\n'


def patch_training_text(s):
    """Add checkpoint imports, fallback globals and guards to training.py text."""
    # Import checkpoint symbols if this stale image did not import them from config.
    if 'CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,' not in s:
        s = re.sub(
            r'([ \t]+)USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,',
            r'\g<0>\n\1CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,',
            s,
        )

    # Module-level fallbacks: safe even if imports are absent/old.
    if '_CKPT_WORKER_THREAD = None' not in s:
        if TIME_BUDGET_LINE in s:
            s = s.replace(TIME_BUDGET_LINE, TIME_BUDGET_LINE + GLOBALS_AFTER_TIME_BUDGET)
        else:
            s = s.replace(SCHEDULES_HEADER, GLOBALS_BEFORE_SCHEDULES + SCHEDULES_HEADER)

    # Ensure these globals exist even if a previous partial hotpatch already
    # added _CKPT_WORKER_THREAD without them.
    if '_prof = False' not in s:
        s = s.replace(
            '_CKPT_WORKER_THREAD = None',
            '_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None',
            1,
        )

    # maybe_resume_ckpt should not die if global import/definition failed.
    s = re.sub(
        r'^([ \t]+)if not RESUME_CKPT or RESUME_CKPT\.lower\(\) == "none":',
        r'\1resume_ckpt = globals().get("RESUME_CKPT", os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none")))'
        r'\n\1if not resume_ckpt or str(resume_ckpt).lower() == "none":',
        s,
        flags=re.M,
    )
    s = s.replace(
        ' resume_path = Path(os.path.expanduser(RESUME_CKPT))',
        ' resume_path = Path(os.path.expanduser(str(resume_ckpt)))',
    )

    # save_ckpt missing blocking arg/global in stale images. Group 1 is the
    # parameter indent, group 2 the body indent.
    s = re.sub(
        r'^([ \t]+)val_bpb: float \| None = None,\n\) -> None:\n([ \t]+)try:',
        r'\1val_bpb: float | None = None,\n\1blocking: bool = False,\n) -> None:'
        r'\n\2global _CKPT_WORKER_THREAD\n\2try:',
        s,
        flags=re.M,
    )
    s = s.replace('if ema_model is not None:', 'if locals().get("ema_model") is not None:')
    return s


def patch_htm_text(s):
    """Splice the stub region class into htm.py text and point the region class at it."""
    if 'class _StubRegion' not in s:
        s = s.replace('import htm_rust', 'import htm_rust' + HTM_STUB)
        s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = _StubRegion')
        s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", _HTMStub)', '_HTM_REGION_CLS = _StubRegion')
    return s


def patch_sdr_semantic_text(s):
    """Add the backward-compatible signature and attribute guards to sdr_semantic.py text."""
    s = re.sub(
        r'contrastive_rank: int = 64,\n[ \t]*\) -> None:',
        'contrastive_rank: int = 64, hebbian_alpha: float = 0.01, learnable: bool | None = None) -> None:',
        s,
    )
    # The guarded form still contains the unguarded assignment, so skip the
    # rewrite once the guard is present to keep the patch idempotent.
    if 'hasattr(self, "_retina_indices")' not in s:
        s = re.sub(
            r'^([ \t]+)self\._retina_indices = fn\(self\._retina_indices\)',
            r'\1if hasattr(self, "_retina_indices") and self._retina_indices is not None:'
            r'\n\1    self._retina_indices = fn(self._retina_indices)'
            r'\n\1if hasattr(self, "_retina_data") and self._retina_data is not None:'
            r'\n\1    self._retina_data = fn(self._retina_data)',
            s,
            flags=re.M,
        )
    if 'self.hebbian_alpha =' not in s:
        s = re.sub(
            r'^([ \t]+)self\.som_alpha = float\(som_alpha\)',
            r'\g<0>\n\1self.hebbian_alpha = 0.01',
            s,
            flags=re.M,
        )
    return s


# 0. config.py: ensure checkpoint constants exist for stale runtime images.
apply_to_file(
    '/workspace/feather/hydra/config.py',
    ensure_config_constants,
    '[hotpatch] config.py checkpoint constants ensured',
)

# 1. training.py: patch stale image checkpoint globals/function NameErrors.
apply_to_file(
    '/workspace/feather/hydra/training.py',
    patch_training_text,
    '[hotpatch] training.py checkpoint/runtime guards v14',
)

# 2. htm.py: stub Rust HTM if native extension is unavailable/build-broken.
apply_to_file(
    '/workspace/feather/subsystems/htm.py',
    patch_htm_text,
    '[hotpatch] htm.py stub ensured',
)

# 3. streaming/data/cache fixes.
patch(
    '/workspace/feather/prepare_nemotron.py',
    'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"',
    'local_only = False',
    'prepare_nemotron local_only=False',
)
patch(
    '/workspace/feather/subsystems/sdr_retina.py',
    'icarus112/feather-retina-cache',
    'GAInTech/feather-retina-cache',
    'retina cache repo',
)

# 4. SDR semantic device movement/backcompat fixes.
apply_to_file(
    '/workspace/feather/subsystems/sdr_semantic.py',
    patch_sdr_semantic_text,
    '[hotpatch] sdr_semantic.py guards ensured',
)
'''

# Base64 output contains no quote characters, so it can sit inside the
# double-quoted Python literal that is itself inside the single-quoted -c arg.
encoded = base64.b64encode(hotpatch_script.encode()).decode()
command = [
    "/bin/bash",
    "-c",
    f"python3 -c 'import base64; exec(base64.b64decode(\"{encoded}\"))' && python /app/entrypoint.py",
]

# Environment variables handed to the job container.
env = {
    "FEATHER_RUNTIME_MODE": "job",
    "HYDRA_BATCH_SIZE": "96",
    "HYDRA_TOTAL_BATCH": "196608",
    "HYDRA_USE_NEMOTRON": "1",
    "HYDRA_TARGET_SHARDS": "0",
    "HYDRA_BACKGROUND_PREFETCH": "0",
    "HYDRA_FORCE_HTM_CPU": "1",
    "HYDRA_INERT_MAMBA": "1",
    "HYDRA_FASTPATH": "0",
    "HYDRA_FUSED_SDR_PROJECT": "0",
    "HYDRA_HTM_FUSED": "0",
    "HYDRA_MODEL_COMPILE": "0",
    "HYDRA_MUON_COMPILE": "0",
    "PYTHONUNBUFFERED": "1",
    "HYDRA_RESUME_CKPT": "none",
    "HYDRA_CKPT_INTERVAL": "1000",
    "HYDRA_CKPT_ROTATIONS": "3",
    "HYDRA_HYENA_LAYERS": "0,1,2,3",
    "HYDRA_N_LAYER": "4",
    "TORCH_COMPILE_BACKEND": "eager",
    "DYNAMO_DISABLE": "1",
}


def load_token() -> str:
    """Return the Hugging Face token from the environment or the CLI cache file.

    ``HF_TOKEN`` takes precedence; otherwise the stripped contents of
    ``~/.cache/huggingface/token`` are used when that file exists.

    Raises:
        SystemExit: with the message ``"HF_TOKEN missing"`` when neither source
            yields a non-empty token.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        token_path = os.path.expanduser("~/.cache/huggingface/token")
        if os.path.exists(token_path):
            # Context manager so the handle is closed deterministically.
            with open(token_path) as fh:
                token = fh.read().strip()
    if not token:
        raise SystemExit("HF_TOKEN missing")
    return token


def build_payload(token: str) -> dict:
    """Return the JSON body for the job-creation request.

    ``token`` is forwarded to the job as its ``HF_TOKEN`` secret.
    """
    return {
        "spaceId": space_id,
        "command": command,
        "environment": env,
        "secrets": {"HF_TOKEN": token},
        "flavor": "a10g-large",
        "timeout": "12h",
    }


def main() -> None:
    """Submit the job, print the API response and exit non-zero on failure.

    Raises:
        SystemExit: when no token is available, when the HTTP request itself
            fails, or (code 1) when the response status is not 200.
    """
    # Imported here so the helpers above stay importable on machines that do
    # not have the third-party ``requests`` package installed.
    import requests

    token = load_token()
    url = f"https://huggingface.co/api/jobs/{namespace}"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.post(
            url,
            json=build_payload(token),
            headers=headers,
            timeout=REQUEST_TIMEOUT_S,
        )
    except requests.RequestException as exc:
        raise SystemExit(f"job submission failed: {exc}") from exc
    print(f"status={r.status_code}")
    print(r.text)
    if r.status_code != 200:
        raise SystemExit(1)


if __name__ == "__main__":
    main()