#!/usr/bin/env python3
"""Submit a Feather pretrain HF Job through the Hugging Face REST API.

Setup (token lookup, routing resolution, overlay sync, Space build wait and
the GPU/time constants) is reused from ``scripts.launch_feather_hf_job``; only
the final job submission is done here with a direct ``requests.post``.
"""
import json
import os
import sys
import time
from pathlib import Path

import requests

REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Reuse the existing launcher logic for setup. This import must come after the
# sys.path tweak above so the ``scripts`` package resolves from the repo root.
import scripts.launch_feather_hf_job as original_launcher  # noqa: E402

# (connect, read) timeouts in seconds for the job-submission request. Without
# a timeout ``requests`` waits forever on a stalled connection.
REQUEST_TIMEOUT = (10, 120)

# Environment-variable prefixes forwarded from the local shell to the job.
_PASSTHROUGH_PREFIXES = ('HYDRA_', 'FEATHER_')


def _sync_space(token, routing):
    """Sync the overlay, upload the image folder to the Space and wait for it."""
    original_launcher.sync_overlay_from_repo()

    # Imported lazily so FEATHER_HF_SKIP_UPLOAD=1 runs never load huggingface_hub.
    from huggingface_hub import HfApi

    api = HfApi(token=token)
    print(f"[launch] uploading folder to {routing.space_repo}...")
    api.upload_folder(
        repo_id=routing.space_repo,
        repo_type='space',
        folder_path=str(original_launcher.IMAGE_DIR),
        commit_message='Update Feather pretrain runtime',
        token=token,
    )
    print("[launch] waiting for Space build...")
    original_launcher.wait_for_space(api, routing.space_repo)


def _build_env(routing):
    """Return the environment mapping sent with the job payload.

    Precedence, highest first: the fixed keys derived from ``routing`` and the
    launcher constants; for A10 flavors, the local value of an A10-default key
    if set, otherwise the A10 default; then any remaining local ``HYDRA_*`` /
    ``FEATHER_*`` variable not already present.
    """
    gpu_profile = original_launcher.GPU_PROFILE
    env = {
        'HF_REPO_ID': routing.output_repo,
        'FEATHER_HF_OWNER': routing.owner,
        'FEATHER_HF_SPACE_REPO': routing.space_repo,
        'FEATHER_HF_OUTPUT_REPO': routing.output_repo,
        'FEATHER_HF_RETINA_CACHE_REPO': routing.retina_cache_repo,
        'HYDRA_RETINA_CACHE_REPO': routing.retina_cache_repo,
        'HYDRA_TARGET_SHARDS': original_launcher.TARGET_SHARDS,
        'HYDRA_TIME_BUDGET': original_launcher.TIME_BUDGET,
        'PYTHONUNBUFFERED': '1',
        'FEATHER_RUNTIME_MODE': 'job',
        'FEATHER_GPU_PROFILE': gpu_profile,
        'FEATHER_HF_FLAVOR': original_launcher.GPU_FLAVOR,
        'HTM_CUDA_ARCH': original_launcher.HTM_CUDA_ARCH,
        'TORCH_CUDA_ARCH_LIST': original_launcher.TORCH_CUDA_ARCH,
        'TRITON_CACHE_DIR': f'/workspace/triton_cache/{gpu_profile}',
        'TRITON_CACHE_REPO': f'{routing.owner}/feather-triton-cache-{gpu_profile}',
    }

    # Apply A10 defaults part of original_launcher
    if original_launcher.GPU_FLAVOR.startswith('a10'):
        a10_defaults = {
            'HYDRA_MUON_COMPILE': '0',
            'HYDRA_FORCE_HTM_CPU': '1',
            'HYDRA_INERT_MAMBA': '1',
            'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
            'HYDRA_FASTPATH': '1',
            'HYDRA_FUSED_SDR_PROJECT': '0',
            'HYDRA_HTM_FUSED': '0',
            'HYDRA_BACKGROUND_PREFETCH': '0',
        }
        for key, default in a10_defaults.items():
            if key in os.environ:
                # An explicit local setting wins over the A10 default.
                env[key] = os.environ[key]
            else:
                env.setdefault(key, default)

    # Passthrough: forward remaining HYDRA_*/FEATHER_* variables, never
    # overriding a key that was already decided above.
    for key, value in os.environ.items():
        if key.startswith(_PASSTHROUGH_PREFIXES) and key not in env:
            env[key] = value

    return env


def _describe_job(job_data):
    """Format the submission log line, tolerating missing response fields."""
    status = job_data.get('status')
    stage = status.get('stage') if isinstance(status, dict) else status
    return (
        f"[launch] submitted job_id={job_data.get('id')} "
        f"status={stage} url={job_data.get('url')}"
    )


def _submit_job(token, routing, env):
    """POST the job payload to the HF Jobs endpoint; return a process exit code."""
    # Payload for REST API
    payload = {
        "spaceId": routing.space_repo,
        "command": ["/bin/bash", "-c", "python /app/entrypoint.py"],
        "env": env,
        "secrets": {"HF_TOKEN": token},
        "flavor": original_launcher.GPU_FLAVOR,
        "timeout": original_launcher.TIMEOUT,
    }

    print(f"[launch] submitting HF Job on {original_launcher.GPU_FLAVOR} via REST...")
    url = f"https://huggingface.co/api/jobs/{routing.job_namespace}"
    headers = {"Authorization": f"Bearer {token}"}

    try:
        resp = requests.post(
            url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts get the same exit code as HTTP errors.
        print(f"[error] request failed: {exc}")
        return 1

    # Any 2xx means the job was accepted; reporting e.g. 201 as a failure
    # would invite a duplicate submission.
    if not 200 <= resp.status_code < 300:
        print(f"[error] {resp.status_code}: {resp.text}")
        return 1

    # The job is already submitted at this point, so an unexpected response
    # body must not turn a successful launch into a crash.
    try:
        job_data = resp.json()
    except ValueError:
        job_data = None
    if isinstance(job_data, dict):
        print(_describe_job(job_data))
    else:
        print(f"[launch] submitted; unexpected response body: {resp.text}")
    return 0


def main():
    """Prepare the Space (unless skipped) and submit the job.

    Returns:
        0 when the job submission was accepted, 1 when the request failed or
        the API answered with a non-2xx status.
    """
    token = original_launcher.require_token()
    routing = original_launcher.resolve_routing(token=token)

    # Sync overlay and wait for space
    if os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') != '1':
        _sync_space(token, routing)

    # Prepare env
    env = _build_env(routing)

    return _submit_job(token, routing, env)


if __name__ == '__main__':
    sys.exit(main())