Spaces:
Runtime error
Runtime error
File size: 5,292 Bytes
import os
import requests
import sys
import base64
# Hugging Face access token taken from the environment; None when HF_TOKEN
# is not set.
token = os.getenv("HF_TOKEN")

# Namespace used in the jobs API URL, and the Space whose image the job runs.
namespace = "GAInTech"
space_id = "GAInTech/feather-a10g-gt80k-runtime-public"
# Python source, kept as a string, that this launcher base64-encodes and runs
# with `python3 -c ... exec(...)` inside the job before /app/entrypoint.py.
# It rewrites files under /workspace/feather by plain substring replacement:
#   1. hydra/training.py         - extends an import list, injects defaults
#                                  after `def main`, and swaps the MDLM mask
#                                  id / resume-checkpoint lookups for
#                                  environment-variable reads
#   2. subsystems/htm.py         - adds a `_StubRegion` class and points
#                                  `_HTM_REGION_CLS` at it
#   3. prepare_nemotron.py       - forces `local_only = False`
#   4. subsystems/sdr_semantic.py - widens a constructor signature and guards
#                                  the `_retina_indices` / `_retina_data`
#                                  conversions
#   5. subsystems/sdr_retina.py  - changes the retina-cache repo id
# Files that do not exist are skipped (each section checks `.exists()`).
#
# The literal is NOT a raw string: `\\n` reaches the embedded script as the
# two-character escape `\n`, and `\"` reaches it as a plain `"`.
#
# NOTE(review): whitespace inside this literal is significant - it is the
# embedded script's indentation and also part of the search/replace patterns.
# Verify leading whitespace against the deployed copy before editing.
# NOTE(review): the `def main` injection in section 1 has no "already patched"
# marker check, so applying the script twice to the same file would insert
# `pre_main` twice - confirm the job always starts from a fresh checkout.
hotpatch_script = """
import os
import sys
from pathlib import Path
def patch(path, old, new):
p = Path(path)
if not p.exists(): return
s = p.read_text()
if old in s:
p.write_text(s.replace(old, new))
print(f'[hotpatch] patched {path}')
# 1. training.py: Final Boss Fix (Checkpointing + Imports)
p1 = Path('/workspace/feather/hydra/training.py')
if p1.exists():
s = p1.read_text()
# Imports fix
im_old = 'WARMUP_RATIO, WEIGHT_DECAY,'
im_new = 'WARMUP_RATIO, WEIGHT_DECAY, CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,'
if im_old in s: s = s.replace(im_old, im_new)
# Safely define variables if imports failed
pre_main = '''
_prof = False
ema_model = None
try: _ci = CKPT_INTERVAL
except: _ci = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))
CKPT_INTERVAL = _ci
try: _cr = CKPT_ROTATIONS
except: _cr = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))
CKPT_ROTATIONS = _cr
'''
for m_line in ['def main() -> None:', 'def main():']:
if m_line in s: s = s.replace(m_line, m_line + pre_main)
# MDLM logic
s = s.replace('mdlm_mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)',
'_m = int(os.environ.get("HYDRA_MDLM_MASK_ID", "-1"))\\n mdlm_mask_id = _m if _m >= 0 else (vocab_size - 1)')
# RESUME logic
raw_resume = 'os.environ.get("HYDRA_RESUME_CKPT", "none")'
s = s.replace('if not RESUME_CKPT or RESUME_CKPT.lower() == "none":',
f'_r = {raw_resume}\\n if not _r or _r.lower() == "none":')
s = s.replace('resume_path = Path(os.path.expanduser(RESUME_CKPT))',
f'resume_path = Path(os.path.expanduser({raw_resume}))')
# Catch block NameError
s = s.replace('if ema_model is not None:', 'if locals().get("ema_model") is not None:')
p1.write_text(s)
print('[hotpatch] training.py absolute supreme final fix')
# 2. htm.py: Stub
p_htm = Path('/workspace/feather/subsystems/htm.py')
if p_htm.exists():
s = p_htm.read_text()
if 'class _StubRegion' not in s:
stub = "\\nclass _StubRegion:\\n def __init__(self, *a, **k): self.n_columns=2048\\n def step(self, *a, **k): import numpy as np; return (np.zeros(2048), None, None, 1.0)\\n def step_many(self, sdr, *a): import numpy as np; T=sdr.shape[0]; return (np.zeros((T,2048)), np.ones(T, dtype=np.float32))\\n def reset(self): pass\\n"
s = s.replace('import htm_rust', 'import htm_rust' + stub)
s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = _StubRegion')
p_htm.write_text(s)
# 3. stream fix
patch('/workspace/feather/prepare_nemotron.py',
'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == \"1\"',
'local_only = False')
# 4. sdr_semantic.py: DEVICE MOVEMENT FIX
p_sem = Path('/workspace/feather/subsystems/sdr_semantic.py')
if p_sem.exists():
s = p_sem.read_text()
s = s.replace('contrastive_rank: int = 64,\\n ) -> None:',
'contrastive_rank: int = 64, hebbian_alpha: float = 0.01, learnable: bool | None = None) -> None:')
old_apply = ' self._retina_indices = fn(self._retina_indices)'
new_apply = ''' if hasattr(self, "_retina_indices") and self._retina_indices is not None:
self._retina_indices = fn(self._retina_indices)
if hasattr(self, "_retina_data") and self._retina_data is not None:
self._retina_data = fn(self._retina_data)'''
if old_apply in s: s = s.replace(old_apply, new_apply)
if 'self.hebbian_alpha =' not in s:
s = s.replace('self.som_alpha = float(som_alpha)', 'self.som_alpha = float(som_alpha)\\n self.hebbian_alpha = 0.01')
p_sem.write_text(s)
# 5. sdr_retina.py: Repo fix
patch('/workspace/feather/subsystems/sdr_retina.py',
'icarus112/feather-retina-cache',
'GAInTech/feather-retina-cache')
"""
# Base64 output contains no quotes, whitespace or newlines, so the multi-line
# script can be dropped into a single shell word below.
encoded = base64.b64encode(hotpatch_script.encode("utf-8")).decode("ascii")

# Container command: decode and exec the hot-patch script first; `&&` starts
# the entrypoint only if that step exits successfully.
command = [
    "/bin/bash",
    "-c",
    f"python3 -c 'import base64; exec(base64.b64decode(\"{encoded}\"))' && python /app/entrypoint.py",
]
# Environment variables passed to the job through the request payload's
# "environment" field. All values are strings. The meaning of each switch is
# defined by the runtime inside the Space, not by this launcher.
env = {
    "FEATHER_RUNTIME_MODE": "job",
    "HYDRA_BATCH_SIZE": "96",
    "HYDRA_TOTAL_BATCH": "196608",
    "HYDRA_USE_NEMOTRON": "1",
    "HYDRA_TARGET_SHARDS": "0",
    "HYDRA_FORCE_HTM_CPU": "1",
    "HYDRA_INERT_MAMBA": "1",
    "HYDRA_FASTPATH": "0",
    "HYDRA_MODEL_COMPILE": "0",
    "HYDRA_MUON_COMPILE": "0",
    "PYTHONUNBUFFERED": "1",
    "HYDRA_RESUME_CKPT": "none",
    "HYDRA_HYENA_LAYERS": "0,1,2,3",
    "HYDRA_N_LAYER": "4",
    "TORCH_COMPILE_BACKEND": "eager",
    "DYNAMO_DISABLE": "1",
}
# Job specification sent as the JSON body of the create-job request.
payload = {
    "spaceId": space_id,
    "command": command,
    "environment": env,
    # The token travels in "secrets" rather than in "environment".
    "secrets": {"HF_TOKEN": token},
    "flavor": "a10g-large",
    # 12 hours, expressed in seconds. The official huggingface_hub client
    # converts duration strings such as "12h" into an integer
    # `timeoutSeconds` field before posting; a raw "timeout": "12h" entry is
    # not the field that client sends.
    # NOTE(review): confirm against the current Jobs API before relying on it.
    "timeoutSeconds": 12 * 60 * 60,
}
# Endpoint that creates a job under the configured namespace.
url = f"https://huggingface.co/api/jobs/{namespace}"

# Stop before any network traffic if the credential is missing; otherwise the
# request would go out with the literal header value "Bearer None".
if not token:
    sys.exit("Error: HF_TOKEN environment variable is not set")

headers = {"Authorization": f"Bearer {token}"}

try:
    # requests waits indefinitely unless a timeout is given.
    r = requests.post(url, json=payload, headers=headers, timeout=60)
except requests.RequestException as exc:
    # Connection errors and timeouts: report briefly and exit non-zero.
    sys.exit(f"Error: request to {url} failed: {exc}")

# `ok` is true for every non-error status, not just 200, so a "201 Created"
# style response is still reported as success.
if r.ok:
    print(f"Success! Job ID: {r.json()['id']}")
else:
    print(f"Error {r.status_code}: {r.text}")
    # Non-zero exit so calling scripts / CI can detect the failed submission.
    sys.exit(1)
|