File size: 8,232 Bytes
e5cf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import requests
import base64


# Resolve the Hugging Face API token: prefer the HF_TOKEN environment
# variable, then fall back to the token cached by `huggingface-cli login`.
token = os.environ.get("HF_TOKEN")
if not token:
    token_path = os.path.expanduser("~/.cache/huggingface/token")
    if os.path.exists(token_path):
        # Context manager closes the handle deterministically (the original
        # bare open() leaked it); the token file is plain UTF-8 text.
        with open(token_path, encoding="utf-8") as fh:
            token = fh.read().strip()
if not token:
    raise SystemExit("HF_TOKEN missing")

# Target namespace and the Space whose runtime image the job executes in.
namespace = "GAInTech"
space_id = "GAInTech/feather-a10g-gt80k-runtime-public"

# Hot-patch script executed on the remote runtime *before* the training
# entrypoint (see the command construction below: it is base64-encoded and
# exec()'d by python3 -c). It edits files baked into a stale runtime image
# in place — checkpoint constants in config.py, checkpoint globals and
# resume guards in training.py, an HTM stub, streaming/cache repo fixes,
# and sdr_semantic back-compat shims — so the image need not be rebuilt.
# NOTE: this is a runtime string literal; its exact bytes are shipped to
# the remote host. Do not reflow, re-indent, or comment inside it.
hotpatch_script = r'''
import os
from pathlib import Path

def patch(path, old, new, label=None):
    p = Path(path)
    if not p.exists():
        print(f'[hotpatch] skip missing {path}')
        return False
    s = p.read_text()
    if old in s:
        p.write_text(s.replace(old, new))
        print(f'[hotpatch] patched {label or path}')
        return True
    print(f'[hotpatch] no-match {label or path}')
    return False

# 0. config.py: ensure checkpoint constants exist for stale runtime images.
p0 = Path('/workspace/feather/hydra/config.py')
if p0.exists():
    s = p0.read_text()
    if 'CKPT_INTERVAL =' not in s:
        s += '\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))\n'
    if 'CKPT_ROTATIONS =' not in s:
        s += 'CKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))\n'
    if 'RESUME_CKPT =' not in s:
        s += 'RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none"))\n'
    p0.write_text(s)
    print('[hotpatch] config.py checkpoint constants ensured')

# 1. training.py: patch stale image checkpoint globals/function NameErrors.
p1 = Path('/workspace/feather/hydra/training.py')
if p1.exists():
    s = p1.read_text()
    # Import checkpoint symbols if this stale image did not import them from config.
    if '    CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,' not in s and '    USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,' in s:
        s = s.replace('    USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,', '    USE_MDLM, MDLM_MASK_ID, MDLM_SCHEDULE,\n    CKPT_INTERVAL, CKPT_ROTATIONS, RESUME_CKPT,')
    # Module-level fallbacks: safe even if imports are absent/old.
    marker = 'TIME_BUDGET = int(os.environ.get("HYDRA_TIME_BUDGET", str(_TIME_BUDGET)))'
    if marker in s and '_CKPT_WORKER_THREAD = None' not in s:
        s = s.replace(marker, marker + '\nCACHE_DIR = Path(os.environ.get("HYDRA_CACHE_DIR", str(Path.home() / ".cache" / "autoresearch")))\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", str(globals().get("CKPT_INTERVAL", 1000))))\nCKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", str(globals().get("CKPT_ROTATIONS", 3))))\nRESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", str(globals().get("RESUME_CKPT", "none"))))\n_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None')
    elif '_CKPT_WORKER_THREAD = None' not in s:
        s = s.replace('# ---------------------------------------------------------------------------\n# Schedules', 'CACHE_DIR = Path(os.environ.get("HYDRA_CACHE_DIR", str(Path.home() / ".cache" / "autoresearch")))\nCKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "1000"))\nCKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))\nRESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none"))\n_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None\n\n# ---------------------------------------------------------------------------\n# Schedules')
    # Ensure these globals exist even if a previous partial hotpatch already added _CKPT_WORKER_THREAD without them.
    if '_prof = False' not in s:
        s = s.replace('_CKPT_WORKER_THREAD = None', '_CKPT_WORKER_THREAD = None\n_prof = False\nema_model = None', 1)
    # maybe_resume_ckpt should not die if global import/definition failed.
    s = s.replace('    if not RESUME_CKPT or RESUME_CKPT.lower() == "none":', '    resume_ckpt = globals().get("RESUME_CKPT", os.environ.get("HYDRA_RESUME_CKPT", os.environ.get("FEATHER_RESUME_CKPT", "none")))\n    if not resume_ckpt or str(resume_ckpt).lower() == "none":')
    s = s.replace('    resume_path = Path(os.path.expanduser(RESUME_CKPT))', '    resume_path = Path(os.path.expanduser(str(resume_ckpt)))')
    # save_ckpt missing blocking arg/global in stale images.
    s = s.replace('    val_bpb: float | None = None,\n) -> None:\n    try:', '    val_bpb: float | None = None,\n    blocking: bool = False,\n) -> None:\n    global _CKPT_WORKER_THREAD\n    try:')
    s = s.replace('if ema_model is not None:', 'if locals().get("ema_model") is not None:')
    p1.write_text(s)
    print('[hotpatch] training.py checkpoint/runtime guards v14')

# 2. htm.py: stub Rust HTM if native extension is unavailable/build-broken.
p_htm = Path('/workspace/feather/subsystems/htm.py')
if p_htm.exists():
    s = p_htm.read_text()
    if 'class _StubRegion' not in s:
        stub = "\nclass _StubRegion:\n    def __init__(self, *a, **k): self.n_columns=2048\n    def step(self, *a, **k): import numpy as np; return (np.zeros(2048), None, None, 1.0)\n    def step_many(self, sdr, *a): import numpy as np; T=sdr.shape[0]; return (np.zeros((T,2048)), np.ones(T, dtype=np.float32))\n    def reset(self): pass\n"
        s = s.replace('import htm_rust', 'import htm_rust' + stub)
        s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = _StubRegion')
        s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", _HTMStub)', '_HTM_REGION_CLS = _StubRegion')
        p_htm.write_text(s)
        print('[hotpatch] htm.py stub ensured')

# 3. streaming/data/cache fixes.
patch('/workspace/feather/prepare_nemotron.py',
      'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"',
      'local_only = False', 'prepare_nemotron local_only=False')
patch('/workspace/feather/subsystems/sdr_retina.py',
      'icarus112/feather-retina-cache',
      'GAInTech/feather-retina-cache', 'retina cache repo')

# 4. SDR semantic device movement/backcompat fixes.
p_sem = Path('/workspace/feather/subsystems/sdr_semantic.py')
if p_sem.exists():
    s = p_sem.read_text()
    s = s.replace('contrastive_rank: int = 64,\n    ) -> None:',
                  'contrastive_rank: int = 64, hebbian_alpha: float = 0.01, learnable: bool | None = None) -> None:')
    old_apply = '        self._retina_indices = fn(self._retina_indices)'
    new_apply = '        if hasattr(self, "_retina_indices") and self._retina_indices is not None:\n            self._retina_indices = fn(self._retina_indices)\n        if hasattr(self, "_retina_data") and self._retina_data is not None:\n            self._retina_data = fn(self._retina_data)'
    if old_apply in s:
        s = s.replace(old_apply, new_apply)
    if 'self.hebbian_alpha =' not in s:
        s = s.replace('self.som_alpha = float(som_alpha)', 'self.som_alpha = float(som_alpha)\n        self.hebbian_alpha = 0.01')
    p_sem.write_text(s)
    print('[hotpatch] sdr_semantic.py guards ensured')
'''

# Ship the hot-patch as base64 so it survives shell quoting intact; the
# remote shell decodes and exec()s it, then starts the real entrypoint only
# if the patch step succeeded (&& short-circuits on failure).
encoded = base64.b64encode(hotpatch_script.encode()).decode()
bootstrap = (
    "python3 -c 'import base64; "
    f'exec(base64.b64decode("{encoded}"))\' '
    "&& python /app/entrypoint.py"
)
command = ["/bin/bash", "-c", bootstrap]

# Environment forwarded to the remote job container. All values are strings
# because they travel through the jobs API as environment variables.
env = dict(
    FEATHER_RUNTIME_MODE="job",
    # Batch geometry.
    HYDRA_BATCH_SIZE="96",
    HYDRA_TOTAL_BATCH="196608",
    # Data pipeline switches.
    HYDRA_USE_NEMOTRON="1",
    HYDRA_TARGET_SHARDS="0",
    HYDRA_BACKGROUND_PREFETCH="0",
    # Subsystem toggles — fused/compiled paths are all disabled here.
    HYDRA_FORCE_HTM_CPU="1",
    HYDRA_INERT_MAMBA="1",
    HYDRA_FASTPATH="0",
    HYDRA_FUSED_SDR_PROJECT="0",
    HYDRA_HTM_FUSED="0",
    HYDRA_MODEL_COMPILE="0",
    HYDRA_MUON_COMPILE="0",
    PYTHONUNBUFFERED="1",
    # Checkpointing: fresh run, checkpoint every 1000 with 3 rotations.
    HYDRA_RESUME_CKPT="none",
    HYDRA_CKPT_INTERVAL="1000",
    HYDRA_CKPT_ROTATIONS="3",
    # Model layout.
    HYDRA_HYENA_LAYERS="0,1,2,3",
    HYDRA_N_LAYER="4",
    # Keep torch in eager mode (dynamo disabled).
    TORCH_COMPILE_BACKEND="eager",
    DYNAMO_DISABLE="1",
)

# Request body for the jobs API submission below.
payload = {
    "spaceId": space_id,
    "command": command,
    "environment": env,
    # Also pass the token as a job secret so the remote process can
    # authenticate to the Hub.
    "secrets": {"HF_TOKEN": token},
    "flavor": "a10g-large",  # hardware flavor requested for the job
    "timeout": "12h",  # hard wall-clock limit for the job
}

# Submit the job and fail loudly on anything but HTTP 200.
url = f"https://huggingface.co/api/jobs/{namespace}"
headers = {"Authorization": f"Bearer {token}"}
# timeout keeps this script from hanging forever on a stalled connection;
# the original call had none, which blocks indefinitely on network issues.
r = requests.post(url, json=payload, headers=headers, timeout=60)
print(f"status={r.status_code}")
print(r.text)
if r.status_code != 200:
    raise SystemExit(1)