icarus112's picture
Update Feather a10g-large training runtime image
e5cf7c3 verified
import os
import requests
import json
from pathlib import Path
# Job target. The Space lives under the same namespace the job is billed to,
# so its id is derived from `namespace` rather than repeated.
namespace = "GAInTech"
flavor = "a10g-large"
space_id = f"{namespace}/feather-a10g-pretrain-v2-1778241600"
# Access token comes from the task environment; it is None when HF_TOKEN is unset.
token = os.getenv("HF_TOKEN")
# Environment passed to the job - deliberately minimal for a first test run.
# Every value is a string, as environment variables are.
_hydra_settings = {
    "HYDRA_BATCH_SIZE": "96",
    "HYDRA_TOTAL_BATCH": "196608",
    "HYDRA_USE_NEMOTRON": "1",
    "HYDRA_TARGET_SHARDS": "0",
    "HYDRA_FORCE_HTM_CPU": "1",
    "HYDRA_INERT_MAMBA": "1",
}
# The runtime-mode switch goes first, followed by the HYDRA_* settings.
env = {"FEATHER_RUNTIME_MODE": "job", **_hydra_settings}
# Endpoint for creating jobs under the namespace, plus the request headers.
url = f"https://huggingface.co/api/jobs/{namespace}"
headers = dict(
    [
        ("Authorization", f"Bearer {token}"),
        ("Content-Type", "application/json"),
    ]
)
# Request body for a Space-backed job. The Space is referenced under the
# camelCase key `spaceId`, and the token is forwarded to the job as a secret.
payload = dict(
    spaceId=space_id,
    command=["/bin/bash", "-c", "python /app/entrypoint.py"],
    env=env,
    secrets={"HF_TOKEN": token},
    flavor=flavor,
    timeout="12h",
)
# Fail fast when the token is missing: otherwise the request would go out
# with the literal header "Bearer None" and a null HF_TOKEN secret.
if not token:
    raise SystemExit("HF_TOKEN is not set; cannot submit the job.")

# Upper bound (seconds) on each HTTP call. requests has no default timeout,
# so without this a stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT_S = 60

print(f"POST {url}")
# First attempt: the Space is referenced under the camelCase key `spaceId`.
resp = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT_S)
print(f"Status: {resp.status_code}")
print(f"Response: {resp.text}")
if resp.status_code != 200:
    # Fallback: resend the same payload with the Space referenced under the
    # snake_case key `space_id` instead. Note this mutates `payload` in place.
    payload.pop("spaceId")
    payload["space_id"] = space_id
    print("Retrying with space_id...")
    resp = requests.post(
        url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT_S
    )
    print(f"Status: {resp.status_code}")
    print(f"Response: {resp.text}")