Spaces:
Runtime error
Runtime error
File size: 8,232 Bytes
e5cf7c3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | import os
import requests
import base64
# Resolve the Hugging Face API token: prefer the HF_TOKEN environment variable,
# otherwise fall back to the token file under ~/.cache/huggingface/token.
token = os.environ.get("HF_TOKEN")
if not token:
    token_path = os.path.expanduser("~/.cache/huggingface/token")
    try:
        # The context manager closes the handle deterministically; opening
        # directly (instead of exists()-then-open()) avoids a check/use race.
        with open(token_path, encoding="utf-8") as token_file:
            token = token_file.read().strip()
    except FileNotFoundError:
        # No cached token file: leave `token` falsy so the check below reports it.
        pass
if not token:
    # Without a token the API call cannot be authorised, so stop here.
    raise SystemExit("HF_TOKEN missing")
# Path segment appended to the jobs API URL when the request is built further down.
namespace = "GAInTech"
# Sent as "spaceId" in the job payload.
space_id = "GAInTech/feather-a10g-gt80k-runtime-public"
# Source text of a patch script. It is base64-encoded further down and exec'd inside the
# job before /app/entrypoint.py is started. It edits files under /workspace/feather in place:
#   0. hydra/config.py            - appends CKPT_INTERVAL / CKPT_ROTATIONS / RESUME_CKPT when absent
#   1. hydra/training.py          - adds the checkpoint imports and module-level fallbacks, and
#                                   rewrites the RESUME_CKPT checks and the save_ckpt signature
#   2. subsystems/htm.py          - injects _StubRegion and points _HTM_REGION_CLS at it
#   3. prepare_nemotron.py and subsystems/sdr_retina.py - literal string swaps through patch()
#   4. subsystems/sdr_semantic.py - constructor signature and _retina_* guards
# It is a raw string so that the "\n" sequences reach the inner script as escape sequences.
# NOTE(review): everything inside the literal is runtime text. The leading whitespace of its
# lines and of its search strings looks collapsed in this copy - confirm against the deployed
# version before relying on the exact matches.
# NOTE(review): new_apply contains old_apply as a substring, so a second run over the same
# sdr_semantic.py would wrap that line again - confirm the script runs once per image.
hotpatch_script = r'''
import os
from pathlib import Path
def patch(path, old, new, label=None):
p = Path(path)
if not p.exists():
print(f'[hotpatch] skip missing {path}')
return False
s = p.read_text()
if old in s:
p.write_text(s.replace(old, new))
print(f'[hotpatch] patched {label or path}')
return True
print(f'[hotpatch] no-match {label or path}')
return False
# 0. config.py: ensure checkpoint constants exist for stale runtime images.
p0 = Path('/workspace/feather/hydra/config.py')
if p0.exists():
s = p0.read_text()
if 'CKPT_INTERVAL =' not in s:
s += '\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))\n'
if 'CKPT_ROTATIONS =' not in s:
s += 'CKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))\n'
if 'RESUME_CKPT =' not in s:
s += 'RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none"))\n'
p0.write_text(s)
print('[hotpatch] config.py checkpoint constants ensured')
# 1. training.py: patch stale image checkpoint globals/function NameErrors.
p1 = Path('/workspace/feather/hydra/training.py')
if p1.exists():
s = p1.read_text()
# Import checkpoint symbols if this stale image did not import them from config.
if ' CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,' not in s and ' USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,' in s:
s = s.replace(' USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,', ' USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,\n CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,')
# Module-level fallbacks: safe even if imports are absent/old.
marker = 'TIME_BUDGET = int(os.environ.get("HYDRA_TIME_BUDGET", str(_TIME_BUDGET)))'
if marker in s and '_CKPT_WORKER_THREAD = None' not in s:
s = s.replace(marker, marker + '\nCACHE_DIR = Path(os.environ.get("HYDRA_CACHE_DIR", str(Path.home() / ".cache" / "autoresearch")))\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", str(globals().get("CKPT_INTERVAL", 1000))))\nCKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", str(globals().get("CKPT_ROTATIONS", 3))))\nRESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", str(globals().get("RESUME_CKPT", "none"))))\n_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None')
elif '_CKPT_WORKER_THREAD = None' not in s:
s = s.replace('# ---------------------------------------------------------------------------\n# Schedules', 'CACHE_DIR = Path(os.environ.get("HYDRA_CACHE_DIR", str(Path.home() / ".cache" / "autoresearch")))\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))\nCKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))\nRESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none"))\n_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None\n\n# ---------------------------------------------------------------------------\n# Schedules')
# Ensure these globals exist even if a previous partial hotpatch already added _CKPT_WORKER_THREAD without them.
if '_prof = False' not in s:
s = s.replace('_CKPT_WORKER_THREAD = None', '_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None', 1)
# maybe_resume_ckpt should not die if global import/definition failed.
s = s.replace(' if not RESUME_CKPT or RESUME_CKPT.lower() == "none":', ' resume_ckpt = globals().get("RESUME_CKPT", os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none")))\n if not resume_ckpt or str(resume_ckpt).lower() == "none":')
s = s.replace(' resume_path = Path(os.path.expanduser(RESUME_CKPT))', ' resume_path = Path(os.path.expanduser(str(resume_ckpt)))')
# save_ckpt missing blocking arg/global in stale images.
s = s.replace(' val_bpb: float | None = None,\n) -> None:\n try:', ' val_bpb: float | None = None,\n blocking: bool = False,\n) -> None:\n global _CKPT_WORKER_THREAD\n try:')
s = s.replace('if ema_model is not None:', 'if locals().get("ema_model") is not None:')
p1.write_text(s)
print('[hotpatch] training.py checkpoint/runtime guards v14')
# 2. htm.py: stub Rust HTM if native extension is unavailable/build-broken.
p_htm = Path('/workspace/feather/subsystems/htm.py')
if p_htm.exists():
s = p_htm.read_text()
if 'class _StubRegion' not in s:
stub = "\nclass _StubRegion:\n def __init__(self, *a, **k): self.n_columns=2048\n def step(self, *a, **k): import numpy as np; return (np.zeros(2048), None, None, 1.0)\n def step_many(self, sdr, *a): import numpy as np; T=sdr.shape[0]; return (np.zeros((T,2048)), np.ones(T, dtype=np.float32))\n def reset(self): pass\n"
s = s.replace('import htm_rust', 'import htm_rust' + stub)
s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = _StubRegion')
s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", _HTMStub)', '_HTM_REGION_CLS = _StubRegion')
p_htm.write_text(s)
print('[hotpatch] htm.py stub ensured')
# 3. streaming/data/cache fixes.
patch('/workspace/feather/prepare_nemotron.py',
'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"',
'local_only = False', 'prepare_nemotron local_only=False')
patch('/workspace/feather/subsystems/sdr_retina.py',
'icarus112/feather-retina-cache',
'GAInTech/feather-retina-cache', 'retina cache repo')
# 4. SDR semantic device movement/backcompat fixes.
p_sem = Path('/workspace/feather/subsystems/sdr_semantic.py')
if p_sem.exists():
s = p_sem.read_text()
s = s.replace('contrastive_rank: int = 64,\n ) -> None:',
'contrastive_rank: int = 64, hebbian_alpha: float = 0.01, learnable: bool | None = None) -> None:')
old_apply = ' self._retina_indices = fn(self._retina_indices)'
new_apply = ' if hasattr(self, "_retina_indices") and self._retina_indices is not None:\n self._retina_indices = fn(self._retina_indices)\n if hasattr(self, "_retina_data") and self._retina_data is not None:\n self._retina_data = fn(self._retina_data)'
if old_apply in s:
s = s.replace(old_apply, new_apply)
if 'self.hebbian_alpha =' not in s:
s = s.replace('self.som_alpha = float(som_alpha)', 'self.som_alpha = float(som_alpha)\n self.hebbian_alpha = 0.01')
p_sem.write_text(s)
print('[hotpatch] sdr_semantic.py guards ensured')
'''
# Base64 keeps the patch script free of quotes and newlines, so it can be
# embedded in a single-quoted `python3 -c` argument without further escaping.
encoded = base64.b64encode(hotpatch_script.encode()).decode()
# First stage: decode and exec the patch script inside the job container.
apply_hotpatch = f"python3 -c 'import base64; exec(base64.b64decode(\"{encoded}\"))'"
# `&&` starts the entrypoint only when the patch stage exited successfully.
command = ["/bin/bash", "-c", f"{apply_hotpatch} && python /app/entrypoint.py"]
# Environment variables passed to the job; every value is a string.
env = {
    # Runtime mode and batch sizing.
    "FEATHER_RUNTIME_MODE": "job",
    "HYDRA_BATCH_SIZE": "96",
    "HYDRA_TOTAL_BATCH": "196608",
    # Data source switches.
    "HYDRA_USE_NEMOTRON": "1",
    "HYDRA_TARGET_SHARDS": "0",
    "HYDRA_BACKGROUND_PREFETCH": "0",
    # Subsystem toggles (fused / fast-path / compile options are all "0").
    "HYDRA_FORCE_HTM_CPU": "1",
    "HYDRA_INERT_MAMBA": "1",
    "HYDRA_FASTPATH": "0",
    "HYDRA_FUSED_SDR_PROJECT": "0",
    "HYDRA_HTM_FUSED": "0",
    "HYDRA_MODEL_COMPILE": "0",
    "HYDRA_MUON_COMPILE": "0",
    "PYTHONUNBUFFERED": "1",
    # Checkpoint settings; the patch script reads these same variable names.
    "HYDRA_RESUME_CKPT": "none",
    "HYDRA_CKPT_INTERVAL": "1000",
    "HYDRA_CKPT_ROTATIONS": "3",
    # Layer configuration.
    "HYDRA_HYENA_LAYERS": "0,1,2,3",
    "HYDRA_N_LAYER": "4",
    # torch.compile / dynamo settings.
    "TORCH_COMPILE_BACKEND": "eager",
    "DYNAMO_DISABLE": "1",
}
# JSON body of the job-creation request.
payload = {
    "spaceId": space_id,
    "command": command,
    "environment": env,
    # The token is forwarded under "secrets", separately from "environment".
    "secrets": {"HF_TOKEN": token},
    "flavor": "a10g-large",
    "timeout": "12h",
}
# Submit the job to the jobs endpoint for `namespace` and report the outcome.
url = f"https://huggingface.co/api/jobs/{namespace}"
headers = {"Authorization": f"Bearer {token}"}
try:
    # requests applies no timeout by default, so a stalled connection would
    # block this script indefinitely; bound the wait explicitly.
    r = requests.post(url, json=payload, headers=headers, timeout=60)
except requests.RequestException as exc:
    # Connection errors and timeouts become a clean non-zero exit with the cause.
    raise SystemExit(f"job submission failed: {exc}") from exc
print(f"status={r.status_code}")
print(r.text)
# Treat every 2xx status as success, so a 201/202 response is not reported as
# a failure after the job has in fact been accepted.
if not 200 <= r.status_code < 300:
    raise SystemExit(1)
|