File size: 3,799 Bytes
422445b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
import json
import os
import sys
import time
import requests
from pathlib import Path

# Repository root: two levels up from this file (the parent of the directory
# holding this script). It is placed at the front of sys.path, once, so that
# the `scripts.*` import that follows can be resolved when run as a script.
REPO_ROOT = Path(__file__).resolve().parents[1]
if all(entry != str(REPO_ROOT) for entry in sys.path):
    sys.path.insert(0, str(REPO_ROOT))

# Reuse the existing launcher logic for setup
import scripts.launch_feather_hf_job as original_launcher

# Upper bound, in seconds, on the job-submission HTTP call. Without an explicit
# timeout `requests` waits indefinitely on a stalled connection.
_SUBMIT_TIMEOUT_SECONDS = 60


def _sync_space(token, routing):
    """Upload the launcher's image folder to the Space, then wait for the Space.

    The overlay sync and the wait are delegated to the existing launcher
    module (`sync_overlay_from_repo` / `wait_for_space`).
    """
    original_launcher.sync_overlay_from_repo()
    # Imported lazily so the skip-upload path does not need huggingface_hub.
    from huggingface_hub import HfApi

    api = HfApi(token=token)
    print(f"[launch] uploading folder to {routing.space_repo}...")
    api.upload_folder(
        repo_id=routing.space_repo,
        repo_type='space',
        folder_path=str(original_launcher.IMAGE_DIR),
        commit_message='Update Feather pretrain runtime',
        token=token,
    )
    print("[launch] waiting for Space build...")
    original_launcher.wait_for_space(api, routing.space_repo)


def _build_job_env(routing):
    """Return the environment-variable mapping that is sent with the job.

    Precedence, highest first: the fixed routing/launcher values below, then
    (for 'a10*' flavors) local overrides of the A10 defaults, then the A10
    defaults themselves, then any other local HYDRA_*/FEATHER_* variable.
    """
    gpu_profile = original_launcher.GPU_PROFILE
    env = {
        'HF_REPO_ID': routing.output_repo,
        'FEATHER_HF_OWNER': routing.owner,
        'FEATHER_HF_SPACE_REPO': routing.space_repo,
        'FEATHER_HF_OUTPUT_REPO': routing.output_repo,
        'FEATHER_HF_RETINA_CACHE_REPO': routing.retina_cache_repo,
        'HYDRA_RETINA_CACHE_REPO': routing.retina_cache_repo,
        'HYDRA_TARGET_SHARDS': original_launcher.TARGET_SHARDS,
        'HYDRA_TIME_BUDGET': original_launcher.TIME_BUDGET,
        'PYTHONUNBUFFERED': '1',
        'FEATHER_RUNTIME_MODE': 'job',
        'FEATHER_GPU_PROFILE': gpu_profile,
        'FEATHER_HF_FLAVOR': original_launcher.GPU_FLAVOR,
        'HTM_CUDA_ARCH': original_launcher.HTM_CUDA_ARCH,
        'TORCH_CUDA_ARCH_LIST': original_launcher.TORCH_CUDA_ARCH,
        'TRITON_CACHE_DIR': f'/workspace/triton_cache/{gpu_profile}',
        'TRITON_CACHE_REPO': f'{routing.owner}/feather-triton-cache-{gpu_profile}',
    }

    # A10-specific defaults; a value set in the local environment wins.
    if original_launcher.GPU_FLAVOR.startswith('a10'):
        a10_defaults = {
            'HYDRA_MUON_COMPILE': '0',
            'HYDRA_FORCE_HTM_CPU': '1',
            'HYDRA_INERT_MAMBA': '1',
            'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
            'HYDRA_FASTPATH': '1',
            'HYDRA_FUSED_SDR_PROJECT': '0',
            'HYDRA_HTM_FUSED': '0',
            'HYDRA_BACKGROUND_PREFETCH': '0',
        }
        for key, default in a10_defaults.items():
            if key in os.environ:
                env[key] = os.environ[key]
            else:
                env.setdefault(key, default)

    # Forward every remaining HYDRA_*/FEATHER_* variable without overriding
    # anything decided above.
    # NOTE(review): these go into the plain "env" field, not "secrets" --
    # confirm no credential is ever exported under these prefixes.
    for key, value in os.environ.items():
        if key.startswith(('HYDRA_', 'FEATHER_')):
            env.setdefault(key, value)

    return env


def main() -> int:
    """Optionally refresh the Space, then submit the job over the REST API.

    Unless the environment variable FEATHER_HF_SKIP_UPLOAD is '1', the image
    folder is uploaded to the Space first. The job is then POSTed to
    ``https://huggingface.co/api/jobs/<job_namespace>``.

    Returns:
        0 when the endpoint answers with a 2xx status, 1 when the request
        fails at the network level or the endpoint answers with any other
        status. Intended to be used as the process exit code.
    """
    token = original_launcher.require_token()
    routing = original_launcher.resolve_routing(token=token)

    if os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') != '1':
        _sync_space(token, routing)

    gpu_flavor = original_launcher.GPU_FLAVOR
    payload = {
        "spaceId": routing.space_repo,
        "command": ["/bin/bash", "-c", "python /app/entrypoint.py"],
        "env": _build_job_env(routing),
        "secrets": {"HF_TOKEN": token},
        "flavor": gpu_flavor,
        "timeout": original_launcher.TIMEOUT,
    }

    print(f"[launch] submitting HF Job on {gpu_flavor} via REST...")
    url = f"https://huggingface.co/api/jobs/{routing.job_namespace}"
    headers = {"Authorization": f"Bearer {token}"}

    try:
        resp = requests.post(
            url,
            json=payload,
            headers=headers,
            timeout=_SUBMIT_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "[error]" / exit-1
        # convention as HTTP errors instead of surfacing as a traceback.
        print(f"[error] job submission request failed: {exc}")
        return 1

    # Any 2xx means the job was accepted; treating e.g. 201 as a failure
    # would invite a retry and a duplicate job.
    if not 200 <= resp.status_code < 300:
        print(f"[error] {resp.status_code}: {resp.text}")
        return 1

    # The job already exists at this point, so an unexpected response shape
    # must not turn a successful submission into a crash.
    try:
        job_data = resp.json()
    except ValueError:
        job_data = None
    if not isinstance(job_data, dict):
        print(
            f"[launch] job accepted ({resp.status_code}) but the response "
            f"was not a JSON object: {resp.text}"
        )
        return 0

    status = job_data.get('status')
    stage = status.get('stage') if isinstance(status, dict) else status
    print(
        f"[launch] submitted job_id={job_data.get('id')} "
        f"status={stage} url={job_data.get('url')}"
    )
    return 0

# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())