File size: 4,706 Bytes
e5cf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import requests
import sys
import base64

# --- Job submission configuration -------------------------------------------
# HF_TOKEN authenticates both the API call below and the job itself (it is
# forwarded to the container as a secret).
token = os.environ.get("HF_TOKEN")
if not token:
    # Fail loudly up front: without this check the script would send
    # "Authorization: Bearer None" and surface only as a confusing 401 later.
    print("Warning: HF_TOKEN is not set; job submission will be unauthorized.",
          file=sys.stderr)

# Target namespace and the Space whose runtime image the job reuses.
namespace = "GAInTech"
space_id = "GAInTech/feather-a10g-gt80k-runtime-public"

# Python source executed *inside the job container* (via base64 + exec, see
# below) before the real entrypoint starts. It monkey-patches files on the
# container's filesystem with string replacement:
#   1. /workspace/feather/hydra/training.py — pre-seed `_prof`/`ema_model` to
#      avoid NameErrors, and route MDLM mask id / resume checkpoint through
#      HYDRA_* env vars instead of module constants.
#   2. /workspace/feather/subsystems/htm.py — replace the htm_rust region
#      class with a numpy stub (_StubRegion) so the run works without the
#      native extension.
#   3. /workspace/feather/prepare_nemotron.py — force streaming (disable
#      local-shards-only mode).
#   4. /workspace/feather/subsystems/sdr_semantic.py — widen a constructor
#      signature, make _apply move both retina tensors, and default
#      hebbian_alpha.
# NOTE: every patch is a literal-substring replace; if the target file drifts,
# the patch silently does nothing (patch() prints only on success).
# Do NOT edit this string casually — '\\n' escapes and exact quoting are load-
# bearing, since the old/new strings must match the remote files byte-for-byte.
hotpatch_script = """
import os
import sys
from pathlib import Path

def patch(path, old, new):
    p = Path(path)
    if not p.exists(): return
    s = p.read_text()
    if old in s:
        p.write_text(s.replace(old, new))
        print(f'[hotpatch] patched {path}')

# 1. training.py fix
p1 = Path('/workspace/feather/hydra/training.py')
if p1.exists():
    s = p1.read_text()
    # NameError: _prof (Ensure always False at start)
    for m_line in ['def main():', 'def main() -> None:']:
        if m_line in s: s = s.replace(m_line, m_line + '\\n    _prof = False\\n    ema_model = None')
    
    # MDLM logic
    s = s.replace('mdlm_mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)',
                  '_m = int(os.environ.get("HYDRA_MDLM_MASK_ID", "-1"))\\n    mdlm_mask_id = _m if _m >= 0 else (vocab_size - 1)')
    
    # RESUME logic
    s = s.replace('if not RESUME_CKPT or RESUME_CKPT.lower() == "none":', 
                  '_r = os.environ.get("HYDRA_RESUME_CKPT", "none")\\n    if not _r or _r.lower() == "none":')
    s = s.replace('resume_path = Path(os.path.expanduser(RESUME_CKPT))',
                  'resume_path = Path(os.path.expanduser(os.environ.get("HYDRA_RESUME_CKPT", "none")))')

    # NameError: ema_model in catch block
    s = s.replace('if ema_model is not None:', 'if locals().get("ema_model") is not None:')
    
    p1.write_text(s)

# 2. htm.py stub
p_htm = Path('/workspace/feather/subsystems/htm.py')
if p_htm.exists():
    s = p_htm.read_text()
    if 'class _StubRegion' not in s:
        stub = "\\nclass _StubRegion:\\n    def __init__(self, *a, **k): self.n_columns=2048\\n    def step(self, *a, **k): import numpy as np; return (np.zeros(2048), None, None, 1.0)\\n    def step_many(self, sdr, *a): import numpy as np; T=sdr.shape[0]; return (np.zeros((T,2048)), np.ones(T, dtype=np.float32))\\n    def reset(self): pass\\n"
        s = s.replace('import htm_rust', 'import htm_rust' + stub)
        s = s.replace('_HTM_REGION_CLS = getattr(htm_rust, "HTMRegion", None)', '_HTM_REGION_CLS = _StubRegion')
        p_htm.write_text(s)

# 3. stream fix
patch('/workspace/feather/prepare_nemotron.py',
      'local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == \"1\"',
      'local_only = False')

# 4. sdr_semantic.py: DEVICE MOVEMENT FIX
p_sem = Path('/workspace/feather/subsystems/sdr_semantic.py')
if p_sem.exists():
    s = p_sem.read_text()
    s = s.replace('contrastive_rank: int = 64,\\n    ) -> None:', 
                  'contrastive_rank: int = 64, hebbian_alpha: float = 0.01, learnable: bool | None = None) -> None:')
    old_apply = '        self._retina_indices = fn(self._retina_indices)'
    new_apply = '''        if hasattr(self, "_retina_indices") and self._retina_indices is not None:
            self._retina_indices = fn(self._retina_indices)
        if hasattr(self, "_retina_data") and self._retina_data is not None:
            self._retina_data = fn(self._retina_data)'''
    if old_apply in s: s = s.replace(old_apply, new_apply)
    if 'self.hebbian_alpha =' not in s:
        s = s.replace('self.som_alpha = float(som_alpha)', 'self.som_alpha = float(som_alpha)\\n        self.hebbian_alpha = 0.01')
    p_sem.write_text(s)
    print('[hotpatch] SUPREME fix applied')
"""

# Smuggle the hotpatch through the shell as base64: its alphabet
# ([A-Za-z0-9+/=]) is safe inside bash single quotes, so no escaping of the
# embedded Python source is needed. The job first execs the decoded patch,
# then (only on success) launches the real entrypoint.
patch_b64 = base64.b64encode(hotpatch_script.encode("utf-8")).decode("ascii")
bootstrap = (
    f"python3 -c 'import base64; exec(base64.b64decode(\"{patch_b64}\"))' "
    "&& python /app/entrypoint.py"
)
command = ["/bin/bash", "-c", bootstrap]

# Environment for the job: run Hydra training in "job" mode with every
# torch.compile / Triton code path switched off (compilation was crashing
# on this runtime), HTM forced onto CPU, and a 4-layer Hyena configuration.
env = dict(
    FEATHER_RUNTIME_MODE="job",
    HYDRA_BATCH_SIZE="96",
    HYDRA_TOTAL_BATCH="196608",
    HYDRA_USE_NEMOTRON="1",
    HYDRA_TARGET_SHARDS="0",
    HYDRA_FORCE_HTM_CPU="1",
    HYDRA_INERT_MAMBA="1",
    HYDRA_FASTPATH="0",  # fastpath is the suspected trigger of the compile bug
    HYDRA_MODEL_COMPILE="0",
    HYDRA_MUON_COMPILE="0",
    PYTHONUNBUFFERED="1",
    HYDRA_RESUME_CKPT="none",  # fresh run; see HYDRA_RESUME_CKPT patch above
    HYDRA_HYENA_LAYERS="0,1,2,3",
    HYDRA_N_LAYER="4",
    TORCH_COMPILE_BACKEND="eager",
    DYNAMO_DISABLE="1",
)

# Request body for the HF Jobs API: reuse the Space's image, override its
# entry command with our bootstrap, inject the env vars and the HF token
# (as a secret, not a plain env var), and pin hardware flavor + wall clock.
payload = dict(
    spaceId=space_id,
    command=command,
    environment=env,
    secrets={"HF_TOKEN": token},
    flavor="a10g-large",
    timeout="12h",
)

# Submit the job to the HF Jobs API under the target namespace.
url = f"https://huggingface.co/api/jobs/{namespace}"
headers = {"Authorization": f"Bearer {token}"}

# requests has NO default timeout — without one a dead connection hangs the
# script forever. 60s covers connect + response for this small JSON POST.
try:
    r = requests.post(url, json=payload, headers=headers, timeout=60)
except requests.RequestException as exc:
    print(f"Request failed: {exc}")
    sys.exit(1)

if r.status_code == 200:
    # .get() instead of ['id'] so an unexpected success body prints None
    # rather than raising KeyError after the job was already created.
    print(f"Success! Job ID: {r.json().get('id')}")
else:
    print(f"Error {r.status_code}: {r.text}")