File size: 6,365 Bytes
0cfefd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
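"""Push the code to a HF Space and run a small sandbox training job there.

According to the estimate from ``estimate_memory.py``:
  - BS=8 + bf16 + PCGrad + GradNorm needs ≥34 GB of GPU memory;
  - the default hardware is **a10g-small** (~24 GB), which matches the tiny
    settings used by ``smoke_train`` / ``sandbox_real_data``;
  - to run the full BS=8, pass ``--gpu a10g-large`` or use an A100.

This script:
  1. calls ``huggingface_hub.create_repo`` to create (or reuse) a Space on HF,
     with Space SDK = ``docker``;
  2. uploads the current repository with ``upload_folder`` (excluding
     ``.venv``, datasets, etc.);
  3. writes ``Dockerfile`` + ``app.py`` (``app.py`` runs the training when the
     Space starts); both files are written into the repository root before
     the upload in step 2, so they are part of what gets uploaded.

Prerequisite: run ``hf auth login`` locally first.
"""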

from __future__ import annotations

import argparse
from pathlib import Path

from huggingface_hub import HfApi, create_repo

# Directory two levels above this file's resolved path (the parent of the
# directory containing this script).  main() writes the generated files here
# and uploads this directory to the Space.
ROOT = Path(__file__).resolve().parent.parent

# Text of the Dockerfile that main() writes to ROOT / "Dockerfile".
# The image: starts from the CUDA 12.4.1 cuDNN runtime base, installs
# Python 3 plus a few system libraries, switches to a uid-1000 user, copies
# pyproject.toml / src / scripts / configs / the dinov3 directory / app.py
# into /app, installs torch + torchvision from the cu124 wheel index followed
# by an editable install of the project, exposes port 7860 and runs app.py.
# NOTE: this is a regular (non-raw) string, so each "\\" below becomes a
# single backslash (a line continuation) in the written Dockerfile.
DOCKERFILE = """FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
RUN apt-get update && apt-get install -y --no-install-recommends \\
    python3 python3-pip python3-venv ffmpeg libgl1 libglib2.0-0 git \\
    && rm -rf /var/lib/apt/lists/*

# HF Space 默认用户(避免权限问题)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH

WORKDIR /app
COPY --chown=user pyproject.toml /app/
COPY --chown=user src /app/src
COPY --chown=user scripts /app/scripts
COPY --chown=user configs /app/configs
COPY --chown=user dinov3-vitb16-pretrain-lvd1689m /app/dinov3-vitb16-pretrain-lvd1689m
COPY --chown=user app.py /app/app.py

RUN python3 -m pip install --user --no-cache-dir --upgrade pip \\
    && python3 -m pip install --user --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu124 \\
    && python3 -m pip install --user --no-cache-dir -e .

EXPOSE 7860
CMD ["python3", "app.py"]
"""

# Source text of the app.py that main() writes to ROOT / "app.py" and that
# the Dockerfile's CMD runs inside the Space.  The generated program:
#   - picks LAUNCH_CMD from the SANDBOX_MODE environment variable
#     ("real_data" -> scripts/sandbox_real_data.py, anything else ->
#     scripts/smoke_train.py);
#   - starts run_training() in a daemon thread, which writes an environment
#     banner to /tmp/wjad.log and then runs LAUNCH_CMD as a subprocess with
#     stdout/stderr redirected into that same log file;
#   - serves the current content of the log file as plain text on every GET
#     request on port 7860.
# NOTE: this is a regular (non-raw) string, so each "\\n" below is written
# to app.py as the two-character escape sequence "\n".
APP_PY = '''"""HF Sandbox 入口(docker SDK,监听 7860)。

启动后:
  1. 后台进程跑 scripts/smoke_train.py(追加写入 /tmp/wjad.log)
  2. 主进程开 HTTP server on :7860,返回最新日志

阶段 A(无需数据):smoke_train 用随机张量验证 GPU 上的 forward/反传/AMP/PCGrad。
阶段 B(需要数据):把 LAUNCH_CMD 改为 runner_local 的真实训练命令。
"""
import os
import subprocess
import sys
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

LOG_PATH = "/tmp/wjad.log"
PORT = 7860
# 当 SANDBOX_MODE=real_data 时跑真实标签 + 占位视频;否则跑随机张量 smoke。
_MODE = os.environ.get("SANDBOX_MODE", "smoke")
if _MODE == "real_data":
    LAUNCH_CMD = [sys.executable, "scripts/sandbox_real_data.py"]
else:
    LAUNCH_CMD = [sys.executable, "scripts/smoke_train.py"]


def _print_env(f):
    f.write("=" * 72 + "\\n")
    f.write(" WJAD HF Sandbox\\n")
    f.write("=" * 72 + "\\n")
    f.write(f"Python: {sys.version}\\n")
    try:
        import torch
        f.write(f"torch: {torch.__version__} cuda_avail={torch.cuda.is_available()}\\n")
        if torch.cuda.is_available():
            p = torch.cuda.get_device_properties(0)
            f.write(f"device: {p.name}  vram={p.total_memory / 1024**3:.2f} GB\\n")
    except Exception as e:
        f.write(f"torch import failed: {e}\\n")
    f.flush()


def run_training():
    with open(LOG_PATH, "w", buffering=1) as f:
        _print_env(f)
        f.write(f"$ {' '.join(LAUNCH_CMD)}\\n")
        f.flush()
        p = subprocess.Popen(
            LAUNCH_CMD, stdout=f, stderr=subprocess.STDOUT, cwd="/app"
        )
        rc = p.wait()
        f.write(f"\\n[exit code = {rc}]\\n")


class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        try:
            with open(LOG_PATH, "r") as f:
                body = f.read()
        except FileNotFoundError:
            body = "starting..."
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.end_headers()
        self.wfile.write(body.encode("utf-8"))

    def log_message(self, fmt, *args):
        return


if __name__ == "__main__":
    threading.Thread(target=run_training, daemon=True).start()
    HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
'''


def main(argv: list[str] | None = None) -> None:
    """Create (or reuse) a Docker HF Space and push the repository to it.

    Steps performed, in order:
      1. parse the command-line options;
      2. create the Space with the Docker SDK, or reuse it if it exists;
      3. make sure the requested hardware is applied to a reused Space too;
      4. store the chosen mode in the ``SANDBOX_MODE`` Space variable;
      5. write ``Dockerfile`` and ``app.py`` into ``ROOT`` (overwriting any
         existing files with those names);
      6. upload ``ROOT`` to the Space, skipping virtualenv / data / caches.

    Args:
        argv: Argument list to parse.  ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", required=True, help="HF Space repo, e.g. user/wjad-sandbox")
    parser.add_argument("--gpu", default="a10g-small", help="HF Spaces 硬件,默认 a10g-small(省 GPU 小时)")
    parser.add_argument("--private", action="store_true")
    parser.add_argument(
        "--mode",
        choices=["smoke", "real_data"],
        default="smoke",
        help="smoke=随机张量;real_data=拉真实标签+占位视频跑 trainer",
    )
    args = parser.parse_args(argv)

    api = HfApi()
    print(f"[push_to_sandbox] 创建 / 复用 Space: {args.repo} (GPU={args.gpu}, mode={args.mode})")
    create_repo(
        args.repo,
        repo_type="space",
        space_sdk="docker",
        space_hardware=args.gpu,
        private=args.private,
        exist_ok=True,
    )

    # With exist_ok=True an already existing Space is returned untouched, so
    # ``space_hardware`` above only takes effect on first creation.  Request
    # the hardware explicitly when the Space is not already set to it;
    # otherwise --gpu would be silently ignored on every re-push.
    runtime = api.get_space_runtime(repo_id=args.repo)
    if runtime.requested_hardware != args.gpu:
        print(
            f"[push_to_sandbox] 请求切换硬件: "
            f"{runtime.requested_hardware} -> {args.gpu}"
        )
        api.request_space_hardware(repo_id=args.repo, hardware=args.gpu)

    # Store SANDBOX_MODE as a Space variable.  HF_TOKEN has to be added by
    # the user under Space Settings -> Secrets; real_data mode requires a
    # token that can access the NVIDIA dataset.
    api.add_space_variable(repo_id=args.repo, key="SANDBOX_MODE", value=args.mode)
    if args.mode == "real_data":
        print(
            "[push_to_sandbox] 提醒:real_data 模式需要在 Space Settings -> Secrets "
            "里手动添加 HF_TOKEN(必须是能访问 nvidia/PhysicalAI-Autonomous-Vehicle-"
            "Cosmos-Drive-Dreams 的账号 token,否则 download.py 会拒绝访问)。"
        )

    # Write Dockerfile / app.py to disk so that they are part of the upload.
    (ROOT / "Dockerfile").write_text(DOCKERFILE, encoding="utf-8")
    (ROOT / "app.py").write_text(APP_PY, encoding="utf-8")

    print("[push_to_sandbox] 上传仓库(排除 .venv / data / 缓存)...")
    api.upload_folder(
        folder_path=str(ROOT),
        repo_id=args.repo,
        repo_type="space",
        ignore_patterns=[
            ".venv/*",
            "data/*",
            "**/__pycache__/*",
            "*.pyc",
            "agent-tools/*",
            ".git/*",
        ],
    )
    print(f"[push_to_sandbox] OK -> https://huggingface.co/spaces/{args.repo}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()