File size: 6,365 Bytes
"""Push the code to an HF Space and run a sandbox micro-training there.

According to the estimate from ``estimate_memory.py``:
- BS=8 + bf16 + PCGrad + GradNorm needs >= 34 GB of GPU memory;
- the default hardware is **a10g-small** (~24 GB), which matches the tiny
  settings used by ``smoke_train`` / ``sandbox_real_data``;
- to run the full BS=8, switch to ``--gpu a10g-large`` or an A100.

This script:
1. calls ``huggingface_hub.create_repo`` to create (or reuse) a Space on HF,
   with Space SDK = ``docker``;
2. uploads the current repository with ``upload_folder`` (excluding ``.venv``,
   datasets, etc.);
3. writes ``Dockerfile`` + ``app.py`` into the repository root before the
   upload (``app.py`` runs the micro-training when the Space starts).

Prerequisite: run ``hf auth login`` locally first.
"""
from __future__ import annotations
import argparse
from pathlib import Path
from huggingface_hub import HfApi, create_repo
# Repository root: two directory levels above this file's resolved path.
# main() writes the generated files here and uploads this folder.
ROOT = Path(__file__).resolve().parent.parent
# Dockerfile text that main() writes verbatim to ROOT / "Dockerfile".
# The image is based on a CUDA 12.4 runtime, installs Python and system
# libraries, switches to a non-root user with uid 1000, copies the project
# sources plus app.py into /app, installs torch from the cu124 wheel index and
# the project itself in editable mode, then starts app.py (port 7860 exposed).
DOCKERFILE = """FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
RUN apt-get update && apt-get install -y --no-install-recommends \\
python3 python3-pip python3-venv ffmpeg libgl1 libglib2.0-0 git \\
&& rm -rf /var/lib/apt/lists/*
# HF Space 默认用户(避免权限问题)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
WORKDIR /app
COPY --chown=user pyproject.toml /app/
COPY --chown=user src /app/src
COPY --chown=user scripts /app/scripts
COPY --chown=user configs /app/configs
COPY --chown=user dinov3-vitb16-pretrain-lvd1689m /app/dinov3-vitb16-pretrain-lvd1689m
COPY --chown=user app.py /app/app.py
RUN python3 -m pip install --user --no-cache-dir --upgrade pip \\
&& python3 -m pip install --user --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu124 \\
&& python3 -m pip install --user --no-cache-dir -e .
EXPOSE 7860
CMD ["python3", "app.py"]
"""
# Source of the app.py entry point that main() writes to ROOT / "app.py".
#
# The generated program:
#   * picks the launch command from the SANDBOX_MODE environment variable
#     ("real_data" -> scripts/sandbox_real_data.py, anything else ->
#     scripts/smoke_train.py);
#   * runs that command in a daemon thread, sending its stdout/stderr to
#     /tmp/wjad.log after a short environment banner;
#   * serves the current contents of that log as text/plain on port 7860.
#
# The text must be valid, correctly indented Python because it is written to
# disk verbatim and executed by the container's CMD. Backslashes are doubled
# ("\\n") so the generated file contains the two-character escape, not a raw
# newline inside a string literal. The log is opened explicitly as UTF-8; the
# reader uses errors="replace" so undecodable bytes emitted by the training
# process cannot make a GET request fail.
APP_PY = '''"""HF Sandbox 入口(docker SDK,监听 7860)。
启动后:
1. 后台进程跑 scripts/smoke_train.py(追加写入 /tmp/wjad.log)
2. 主进程开 HTTP server on :7860,返回最新日志
阶段 A(无需数据):smoke_train 用随机张量验证 GPU 上的 forward/反传/AMP/PCGrad。
阶段 B(需要数据):把 LAUNCH_CMD 改为 runner_local 的真实训练命令。
"""
import os
import subprocess
import sys
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

LOG_PATH = "/tmp/wjad.log"
PORT = 7860

# 当 SANDBOX_MODE=real_data 时跑真实标签 + 占位视频;否则跑随机张量 smoke。
_MODE = os.environ.get("SANDBOX_MODE", "smoke")
if _MODE == "real_data":
    LAUNCH_CMD = [sys.executable, "scripts/sandbox_real_data.py"]
else:
    LAUNCH_CMD = [sys.executable, "scripts/smoke_train.py"]


def _print_env(f):
    f.write("=" * 72 + "\\n")
    f.write(" WJAD HF Sandbox\\n")
    f.write("=" * 72 + "\\n")
    f.write(f"Python: {sys.version}\\n")
    try:
        import torch
        f.write(f"torch: {torch.__version__} cuda_avail={torch.cuda.is_available()}\\n")
        if torch.cuda.is_available():
            p = torch.cuda.get_device_properties(0)
            f.write(f"device: {p.name} vram={p.total_memory / 1024**3:.2f} GB\\n")
    except Exception as e:
        f.write(f"torch import failed: {e}\\n")
    f.flush()


def run_training():
    with open(LOG_PATH, "w", buffering=1, encoding="utf-8") as f:
        _print_env(f)
        f.write(f"$ {' '.join(LAUNCH_CMD)}\\n")
        f.flush()
        p = subprocess.Popen(
            LAUNCH_CMD, stdout=f, stderr=subprocess.STDOUT, cwd="/app"
        )
        rc = p.wait()
        f.write(f"\\n[exit code = {rc}]\\n")


class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        try:
            with open(LOG_PATH, "r", encoding="utf-8", errors="replace") as f:
                body = f.read()
        except FileNotFoundError:
            body = "starting..."
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.end_headers()
        self.wfile.write(body.encode("utf-8"))

    def log_message(self, fmt, *args):
        return


if __name__ == "__main__":
    threading.Thread(target=run_training, daemon=True).start()
    HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
'''
def main() -> None:
    """Create or reuse a Hugging Face Space and upload this repository to it.

    Command-line options:
        --repo     Space repo id (required), e.g. ``user/wjad-sandbox``.
        --gpu      Value passed as ``space_hardware``; defaults to ``a10g-small``.
        --private  Create the Space as private.
        --mode     ``smoke`` (default) or ``real_data``; stored in the Space
                   variable ``SANDBOX_MODE``.

    Side effects, in order: creates/reuses the Space, sets the
    ``SANDBOX_MODE`` variable, writes ``Dockerfile`` and ``app.py`` into
    ``ROOT``, uploads ``ROOT`` to the Space, and prints progress messages.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--repo", required=True, help="HF Space repo, e.g. user/wjad-sandbox")
    cli.add_argument("--gpu", default="a10g-small", help="HF Spaces 硬件,默认 a10g-small(省 GPU 小时)")
    cli.add_argument("--private", action="store_true")
    cli.add_argument(
        "--mode",
        choices=["smoke", "real_data"],
        default="smoke",
        help="smoke=随机张量;real_data=拉真实标签+占位视频跑 trainer",
    )
    opts = cli.parse_args()
    repo_id = opts.repo
    hardware = opts.gpu
    mode = opts.mode

    hub = HfApi()
    print(f"[push_to_sandbox] 创建 / 复用 Space: {repo_id} (GPU={hardware}, mode={mode})")
    # NOTE(review): exist_ok=True reuses an existing Space; confirm in the
    # huggingface_hub docs whether space_hardware is applied in that case.
    space_options = {
        "repo_type": "space",
        "space_sdk": "docker",
        "space_hardware": hardware,
        "private": opts.private,
        "exist_ok": True,
    }
    create_repo(repo_id, **space_options)

    # Publish the mode as a Space variable (the generated app.py reads
    # SANDBOX_MODE from its environment). HF_TOKEN is not set here: the user
    # has to add a token with access to the NVIDIA dataset under
    # Space Settings -> Secrets; real_data mode requires it.
    hub.add_space_variable(repo_id=repo_id, key="SANDBOX_MODE", value=mode)
    if mode == "real_data":
        reminder = (
            "[push_to_sandbox] 提醒:real_data 模式需要在 Space Settings -> Secrets "
            "里手动添加 HF_TOKEN(必须是能访问 nvidia/PhysicalAI-Autonomous-Vehicle-"
            "Cosmos-Drive-Dreams 的账号 token,否则 download.py 会拒绝访问)。"
        )
        print(reminder)

    # Materialise the generated Dockerfile / app.py in the repository root so
    # that the upload below includes them.
    generated_files = (("Dockerfile", DOCKERFILE), ("app.py", APP_PY))
    for file_name, content in generated_files:
        (ROOT / file_name).write_text(content, encoding="utf-8")

    print("[push_to_sandbox] 上传仓库(排除 .venv / data / 缓存)...")
    # Paths matching these patterns are left out of the upload.
    excluded = [
        ".venv/*",
        "data/*",
        "**/__pycache__/*",
        "*.pyc",
        "agent-tools/*",
        ".git/*",
    ]
    hub.upload_folder(
        folder_path=str(ROOT),
        repo_id=repo_id,
        repo_type="space",
        ignore_patterns=excluded,
    )
    print(f"[push_to_sandbox] OK -> https://huggingface.co/spaces/{repo_id}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|