# icarus112's picture
# Update Feather a10g-large training runtime image
# e5cf7c3 verified
import os
import requests
import sys
import base64
# Submission settings. The access token is read from the HF_TOKEN environment
# variable and is None when that variable is not set.
token = os.getenv("HF_TOKEN")
namespace = "GAInTech"
# The space lives under the same namespace the job is submitted to.
space_id = f"{namespace}/feather-a10g-gt80k-runtime-public"
# Source text of a Python "hotpatch" script, kept as a string. Further down it
# is base64-encoded and run inside the job via `python3 -c ... exec(...)` before
# `/app/entrypoint.py` is started.
#
# The embedded script edits files under /workspace/feather by plain text
# replacement; every step is skipped when its target file does not exist:
#   1. hydra/training.py      - extends an import list with CKPT_INTERVAL,
#                               CKPT_ROTATIONS and RESUME_CKPT, injects fallback
#                               definitions right after the `def main()` line,
#                               swaps the MDLM_MASK_ID / RESUME_CKPT reads for
#                               HYDRA_MDLM_MASK_ID / HYDRA_RESUME_CKPT
#                               environment lookups, and guards `ema_model`
#                               through `locals().get(...)`.
#   2. subsystems/htm.py      - inserts a `_StubRegion` class after
#                               `import htm_rust` and assigns it to
#                               `_HTM_REGION_CLS`.
#   3. prepare_nemotron.py    - replaces the HYDRA_LOCAL_SHARDS_ONLY check with
#                               `local_only = False`.
#   4. subsystems/sdr_semantic.py - adds `hebbian_alpha` / `learnable`
#                               parameters to a signature, wraps the
#                               `_retina_indices` / `_retina_data` updates in
#                               hasattr/None checks, and sets
#                               `self.hebbian_alpha = 0.01` if no such
#                               assignment is present.
#   5. subsystems/sdr_retina.py - renames the repo id
#                               `icarus112/feather-retina-cache` to
#                               `GAInTech/feather-retina-cache`.
#
# Escapes: `\\n` here becomes the two characters `\n` in the script text (a
# newline escape inside the script's own string literals); `\"` becomes `"`.
#
# NOTE(review): in this copy no line of the literal carries leading
# indentation (e.g. the body of `def patch` is flush left), so the embedded
# script would raise IndentationError when exec'd. Whitespace inside the nested
# replacement strings also looks collapsed. Presumably formatting was lost in
# transit - verify against the original file before relying on this text.
hotpatch_script = """
import os
import sys
from pathlib import Path
def patch(path, old, new):
p = Path(path)
if not p.exists(): return
s = p.read_text()
if old in s:
p.write_text(s.replace(old, new))
print(f'[hotpatch] patched {path}')
# 1. training.py: Final Boss Fix (Checkpointing + Imports)
p1 = Path('/workspace/feather/hydra/training.py')
if p1.exists():
s = p1.read_text()
# Imports fix
im_old = 'WARMUP_RATIO, WEIGHT_DECAY,'
im_new = 'WARMUP_RATIO, WEIGHT_DECAY, CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,'
if im_old in s: s = s.replace(im_old, im_new)
# Safely define variables if imports failed
pre_main = '''
_prof = False
ema_model = None
try: _ci = CKPT_INTERVAL
except: _ci = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))
CKPT_INTERVAL = _ci
try: _cr = CKPT_ROTATIONS
except: _cr = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))
CKPT_ROTATIONS = _cr
'''
for m_line in ['def main() -> None:', 'def main():']:
if m_line in s: s = s.replace(m_line, m_line + pre_main)
# MDLM logic
s = s.replace('mdlm_mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)',
'_m = int(os.environ.get("HYDRA_MDLM_MASK_ID", "-1"))\\n mdlm_mask_id = _m if _m >= 0 else (vocab_size - 1)')
# RESUME logic
raw_resume = 'os.environ.get("HYDRA_RESUME_CKPT", "none")'
s = s.replace('if not RESUME_CKPT or RESUME_CKPT.lower() == "none":',
f'_r = {raw_resume}\\n if not _r or _r.lower() == "none":')
s = s.replace('resume_path = Path(os.path.expanduser(RESUME_CKPT))',
f'resume_path = Path(os.path.expanduser({raw_resume}))')
# Catch block NameError
s = s.replace('if ema_model is not None:', 'if locals().get("ema_model") is not None:')
p1.write_text(s)
print('[hotpatch] training.py absolute supreme final fix')
# 2. htm.py: Stub
p_htm = Path('/workspace/feather/subsystems/htm.py')
if p_htm.exists():
s = p_htm.read_text()
if 'class _StubRegion' not in s:
stub = "\\nclass _StubRegion:\\n def __init__(self, *a, **k): self.n_columns=2048\\n def step(self, *a, **k): import numpy as np; return (np.zeros(2048), None, None, 1.0)\\n def step_many(self, sdr, *a): import numpy as np; T=sdr.shape[0]; return (np.zeros((T,2048)), np.ones(T, dtype=np.float32))\\n def reset(self): pass\\n"
s = s.replace('import htm_rust', 'import htm_rust' + stub)
s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = _StubRegion')
p_htm.write_text(s)
# 3. stream fix
patch('/workspace/feather/prepare_nemotron.py',
'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == \"1\"',
'local_only = False')
# 4. sdr_semantic.py: DEVICE MOVEMENT FIX
p_sem = Path('/workspace/feather/subsystems/sdr_semantic.py')
if p_sem.exists():
s = p_sem.read_text()
s = s.replace('contrastive_rank: int = 64,\\n ) -> None:',
'contrastive_rank: int = 64, hebbian_alpha: float = 0.01, learnable: bool | None = None) -> None:')
old_apply = ' self._retina_indices = fn(self._retina_indices)'
new_apply = ''' if hasattr(self, "_retina_indices") and self._retina_indices is not None:
self._retina_indices = fn(self._retina_indices)
if hasattr(self, "_retina_data") and self._retina_data is not None:
self._retina_data = fn(self._retina_data)'''
if old_apply in s: s = s.replace(old_apply, new_apply)
if 'self.hebbian_alpha =' not in s:
s = s.replace('self.som_alpha = float(som_alpha)', 'self.som_alpha = float(som_alpha)\\n self.hebbian_alpha = 0.01')
p_sem.write_text(s)
# 5. sdr_retina.py: Repo fix
patch('/workspace/feather/subsystems/sdr_retina.py',
'icarus112/feather-retina-cache',
'GAInTech/feather-retina-cache')
"""
# Base64 output contains no quote characters, so the hotpatch source can be
# embedded inside the quoted shell command below without further escaping.
encoded = base64.b64encode(hotpatch_script.encode()).decode()

# Two shell steps chained with `&&`: decode-and-exec the hotpatch first, then
# start the entrypoint only if the first step exited successfully.
command = [
    "/bin/bash",
    "-c",
    " && ".join(
        (
            "python3 -c 'import base64; exec(base64.b64decode(\"" + encoded + "\"))'",
            "python /app/entrypoint.py",
        )
    ),
]
# Environment variables sent with the job request. Every value is a string;
# insertion order matches the original listing.
env = dict(
    FEATHER_RUNTIME_MODE="job",
    HYDRA_BATCH_SIZE="96",
    HYDRA_TOTAL_BATCH="196608",
    HYDRA_USE_NEMOTRON="1",
    HYDRA_TARGET_SHARDS="0",
    HYDRA_FORCE_HTM_CPU="1",
    HYDRA_INERT_MAMBA="1",
    HYDRA_FASTPATH="0",
    HYDRA_MODEL_COMPILE="0",
    HYDRA_MUON_COMPILE="0",
    PYTHONUNBUFFERED="1",
    HYDRA_RESUME_CKPT="none",
    HYDRA_HYENA_LAYERS="0,1,2,3",
    HYDRA_N_LAYER="4",
    TORCH_COMPILE_BACKEND="eager",
    DYNAMO_DISABLE="1",
)
# JSON body of the job request: which space to run, the command and
# environment to run it with, the token passed as a secret, and the
# hardware flavor / timeout strings.
payload = dict(
    spaceId=space_id,
    command=command,
    environment=env,
    secrets={"HF_TOKEN": token},
    flavor="a10g-large",
    timeout="12h",
)
# Submit the job request and report the outcome.
#
# Exits with a non-zero status (message on stderr) when the token is missing,
# the HTTP request cannot be completed, or the server answers with an error
# status, so that shells and CI can detect a failed submission.
if not token:
    # Without this guard the request would be sent with "Bearer None" and a
    # null HF_TOKEN secret.
    sys.exit("HF_TOKEN is not set; export it before submitting the job.")

url = f"https://huggingface.co/api/jobs/{namespace}"
headers = {"Authorization": f"Bearer {token}"}
try:
    # requests applies no timeout by default; bound the wait so a stalled
    # connection cannot hang the script indefinitely.
    r = requests.post(url, json=payload, headers=headers, timeout=30)
except requests.RequestException as exc:
    sys.exit(f"Request to {url} failed: {exc}")

# Treat any non-error status as success rather than only 200.
if r.ok:
    print(f"Success! Job ID: {r.json()['id']}")
else:
    sys.exit(f"Error {r.status_code}: {r.text}")