# feather-a10g-large-runtime/overlay/scripts/final_launch_rest.py
# icarus112's picture
# Update Feather a10g-large training runtime image
# e5cf7c3 verified
import os
import requests
import sys
# Namespace segment of the Jobs API URL the request is posted to.
NAMESPACE = "GAInTech"
# Value sent as the payload's "spaceId" field.
SPACE_ID = "GAInTech/feather-a10g-gt80k-runtime-public"
# (connect, read) timeout in seconds for the HTTP call itself. This is
# unrelated to the payload's "timeout" field, which is just data sent to the
# API. Without it, requests waits forever on a stalled connection.
REQUEST_TIMEOUT = (10, 60)

# Critical Trajectory: B=96, Stream-only, A10G-compatible
JOB_ENV = {
    "FEATHER_RUNTIME_MODE": "job",
    "HYDRA_BATCH_SIZE": "96",
    "HYDRA_TOTAL_BATCH": "196608",
    "HYDRA_USE_NEMOTRON": "1",
    "HYDRA_TARGET_SHARDS": "0",
    "HYDRA_FORCE_HTM_CPU": "1",
    "HYDRA_INERT_MAMBA": "1",
    "HYDRA_FASTPATH": "1",
    "HYDRA_ALLOW_SYNTHETIC_RETINA": "1",
    "HYDRA_FUSED_SDR_PROJECT": "0",
    "HYDRA_HTM_FUSED": "0",
    "HYDRA_MUON_COMPILE": "0",
    "PYTHONUNBUFFERED": "1",
}


def build_payload(token):
    """Return the JSON body posted to the Jobs API.

    Args:
        token: Hugging Face token; forwarded to the job as the ``HF_TOKEN``
            secret.

    Returns:
        A new dict on every call. The environment mapping is copied so that
        callers mutating the result cannot alter ``JOB_ENV``.
    """
    return {
        "spaceId": SPACE_ID,
        "command": ["/bin/bash", "-lc", "python /app/entrypoint.py"],
        "environment": dict(JOB_ENV),
        "secrets": {"HF_TOKEN": token},
        "flavor": "a10g-large",
        "timeout": "12h",
    }


def main():
    """Submit the job and report the outcome on stdout.

    Reads the token from the ``HF_TOKEN`` environment variable, posts the
    payload to ``https://huggingface.co/api/jobs/<NAMESPACE>`` and prints the
    HTTP status plus either the job id/stage or the error body.

    Returns:
        Process exit code: 0 when the API accepted the job and returned a
        JSON object, 1 on a missing token, a transport error, a non-2xx
        status, or an unparseable response body.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("HF_TOKEN missing")
        return 1

    url = f"https://huggingface.co/api/jobs/{NAMESPACE}"
    headers = {"Authorization": f"Bearer {token}"}
    print(f"Submitting to {url} with spaceId + environment...")

    try:
        response = requests.post(
            url,
            json=build_payload(token),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # DNS failure, refused connection, timeout, etc.: report it as a
        # one-line error instead of an unhandled traceback.
        print(f"Error: {exc}")
        return 1

    print(f"Status: {response.status_code}")
    # Accept any 2xx: a creation endpoint may legitimately answer 201/202.
    if not 200 <= response.status_code < 300:
        print(f"Error: {response.text}")
        return 1

    try:
        job = response.json()
    except ValueError:
        job = None
    if not isinstance(job, dict):
        # A 2xx reply without a JSON object gives no job id to follow up on.
        print(f"Error: {response.text}")
        return 1

    # Tolerate missing fields so a schema change cannot crash the launcher
    # after the job has already been accepted.
    status = job.get("status")
    stage = status.get("stage", "unknown") if isinstance(status, dict) else "unknown"
    print(f"Success! Job ID: {job.get('id', 'unknown')} Status: {stage}")
    return 0


if __name__ == "__main__":
    sys.exit(main())