# icarus112's picture
# Update Feather a10g-large training runtime image
# e5cf7c3 verified
import os
import shlex
import sys

import requests
# Where the job is submitted: the owning namespace and the Space used as runtime.
namespace = "GAInTech"
space_id = "GAInTech/feather-a10g-gt80k-runtime-public"

# API token read from the environment; None when HF_TOKEN is not set.
token = os.getenv("HF_TOKEN")
# Python source run inside the job container before the real entrypoint.
# It rewrites three files under /workspace/feather in place:
#   1. hydra/training.py     - swaps the MDLM_MASK_ID expression for an
#                              environment lookup (HYDRA_MDLM_MASK_ID).
#   2. subsystems/htm.py     - injects a _HTMStubRegion class and uses it as
#                              the getattr() fallback for HTMRegion.
#   3. prepare_nemotron.py   - flips the HYDRA_LOCAL_SHARDS_ONLY default
#                              from "1" to "0".
# Every patch is skipped when its target file does not exist; patches 1 and 3
# are also skipped when the expected source line is absent, and patch 2 when
# the stub class is already present.
# The bodies of the `if` statements must stay indented: the text is compiled
# as-is by `python3 -c`, so a flat script fails with IndentationError.
# NOTE(review): patch 1 writes a line that uses os.environ into training.py -
# confirm that module imports os. Patch 2 inserts a column-0 class after every
# 'import htm_rust' occurrence - confirm that import sits at module top level.
hotpatch_script = """
import os
from pathlib import Path
# 1. Fix MDLM_MASK_ID NameError in hydra/training.py
p1 = Path('/workspace/feather/hydra/training.py')
if p1.exists():
    s = p1.read_text()
    old = "mdlm_mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)"
    new = "mdlm_mask_id = int(os.environ.get('HYDRA_MDLM_MASK_ID', str(vocab_size - 1)))"
    if old in s:
        p1.write_text(s.replace(old, new))
        print('[hotpatch] patched training.py')
# 2. Fix HTMRegion missing in subsystems/htm.py (Stub)
p2 = Path('/workspace/feather/subsystems/htm.py')
if p2.exists():
    s = p2.read_text()
    if 'class _HTMStubRegion' not in s:
        stub = "\\nclass _HTMStubRegion:\\n def __init__(self, *a, **k): self.n_columns=2048\\n def step(self, *a, **k): return (None, None, None, 1.0)\\n def step_many(self, sdr, *a): T=sdr.shape[0]; return (None, [1.0]*T)\\n def reset(self): pass\\n"
        s = s.replace('import htm_rust', 'import htm_rust' + stub)
        s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", _HTMStubRegion)')
        p2.write_text(s)
        print('[hotpatch] patched htm.py')
# 3. Fix HYDRA_LOCAL_SHARDS_ONLY in prepare_nemotron.py
p3 = Path('/workspace/feather/prepare_nemotron.py')
if p3.exists():
    s = p3.read_text()
    old = 'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"'
    new = 'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "0") == "1"'
    if old in s:
        p3.write_text(s.replace(old, new))
        print('[hotpatch] patched prepare_nemotron.py')
"""

# Command line for the job: apply the hotpatch, then start the entrypoint only
# if the hotpatch exited successfully (&&).
# shlex.quote() is required here rather than repr(): the script contains both
# quote characters and newlines, and repr() emits Python escapes (\' and \n)
# that bash does not interpret inside a single-quoted word.
command = [
    "/bin/bash",
    "-lc",
    f"python3 -c {shlex.quote(hotpatch_script)} && python /app/entrypoint.py",
]
# Environment variables sent with the job request. Built up in three steps so
# the HYDRA_* settings stay grouped; insertion order matches the original
# literal.
env = {"FEATHER_RUNTIME_MODE": "job"}
env.update(
    {
        "HYDRA_BATCH_SIZE": "96",
        "HYDRA_TOTAL_BATCH": "196608",
        "HYDRA_USE_NEMOTRON": "1",
        "HYDRA_TARGET_SHARDS": "0",
        "HYDRA_FORCE_HTM_CPU": "1",
        "HYDRA_INERT_MAMBA": "1",
        "HYDRA_FASTPATH": "1",
    }
)
env["PYTHONUNBUFFERED"] = "1"
# Stop before building the request when no token is available: otherwise the
# Authorization header would read "Bearer None" and the job secret would be
# null.
if not token:
    sys.exit("HF_TOKEN is not set; refusing to submit the job.")

# JSON body of the job request. The token is forwarded to the job under the
# "secrets" key so it is also available as HF_TOKEN inside the job.
payload = {
    "spaceId": space_id,
    "command": command,
    "environment": env,
    "secrets": {"HF_TOKEN": token},
    "flavor": "a10g-large",
    "timeout": "12h",
}
url = f"https://huggingface.co/api/jobs/{namespace}"
headers = {"Authorization": f"Bearer {token}"}

print(f"Submitting HOTPATCH launch to {url}...")
try:
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection.
    r = requests.post(url, json=payload, headers=headers, timeout=60)
except requests.RequestException as exc:
    sys.exit(f"Request to {url} failed: {exc}")

if r.status_code == 200:
    print(f"Success! Job ID: {r.json()['id']}")
else:
    print(f"Error {r.status_code}: {r.text}")
    # Report the failure through the exit status so callers and CI notice it.
    sys.exit(1)