"""Push this repository to a Hugging Face Space and run sandbox micro-training.

Sizing, based on the estimates from ``estimate_memory.py``:

- BS=8 + bf16 + PCGrad + GradNorm needs >= 34 GB of GPU memory;
- the default hardware is **a10g-small** (~24 GB), which matches the tiny
  settings used by ``smoke_train`` / ``sandbox_real_data``;
- to run the full BS=8, pass ``--gpu a10g-large`` or use an A100.

What this script does:

1. creates (or reuses) a Space with ``huggingface_hub.create_repo`` using
   Space SDK = ``docker``, and makes sure the Space requests the hardware
   tier given by ``--gpu`` even when the Space already existed;
2. writes ``Dockerfile`` + ``app.py`` into the repository root (``app.py``
   runs the micro-training when the Space starts);
3. uploads the repository with ``upload_folder`` (excluding ``.venv``,
   datasets, caches, ...).

Requirement: run ``hf auth login`` locally first.
"""

from __future__ import annotations

import argparse
from pathlib import Path

from huggingface_hub import HfApi, create_repo

# Repository root: this file is expected to live one directory below it.
ROOT = Path(__file__).resolve().parent.parent

# Dockerfile written to the repository root and uploaded with it.
# NOTE: this is a runtime string; the image is built from it verbatim.
DOCKERFILE = """FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

RUN apt-get update && apt-get install -y --no-install-recommends \\
        python3 python3-pip python3-venv ffmpeg libgl1 libglib2.0-0 git \\
    && rm -rf /var/lib/apt/lists/*

# HF Space 默认用户(避免权限问题)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH

WORKDIR /app
COPY --chown=user pyproject.toml /app/
COPY --chown=user src /app/src
COPY --chown=user scripts /app/scripts
COPY --chown=user configs /app/configs
COPY --chown=user dinov3-vitb16-pretrain-lvd1689m /app/dinov3-vitb16-pretrain-lvd1689m
COPY --chown=user app.py /app/app.py

RUN python3 -m pip install --user --no-cache-dir --upgrade pip \\
    && python3 -m pip install --user --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu124 \\
    && python3 -m pip install --user --no-cache-dir -e .

EXPOSE 7860
CMD ["python3", "app.py"]
"""

# Source of the Space entry point (``app.py``). It starts the training command
# in a background thread and serves the log file over HTTP on port 7860.
# NOTE: this is a runtime string; it is written to disk verbatim.
APP_PY = '''"""HF Sandbox 入口(docker SDK,监听 7860)。

启动后:
1. 后台进程跑 scripts/smoke_train.py(追加写入 /tmp/wjad.log)
2. 主进程开 HTTP server on :7860,返回最新日志

阶段 A(无需数据):smoke_train 用随机张量验证 GPU 上的 forward/反传/AMP/PCGrad。
阶段 B(需要数据):把 LAUNCH_CMD 改为 runner_local 的真实训练命令。
"""
import os
import subprocess
import sys
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

LOG_PATH = "/tmp/wjad.log"
PORT = 7860

# 当 SANDBOX_MODE=real_data 时跑真实标签 + 占位视频;否则跑随机张量 smoke。
_MODE = os.environ.get("SANDBOX_MODE", "smoke")
if _MODE == "real_data":
    LAUNCH_CMD = [sys.executable, "scripts/sandbox_real_data.py"]
else:
    LAUNCH_CMD = [sys.executable, "scripts/smoke_train.py"]


def _print_env(f):
    f.write("=" * 72 + "\\n")
    f.write(" WJAD HF Sandbox\\n")
    f.write("=" * 72 + "\\n")
    f.write(f"Python: {sys.version}\\n")
    try:
        import torch
        f.write(f"torch: {torch.__version__} cuda_avail={torch.cuda.is_available()}\\n")
        if torch.cuda.is_available():
            p = torch.cuda.get_device_properties(0)
            f.write(f"device: {p.name} vram={p.total_memory / 1024**3:.2f} GB\\n")
    except Exception as e:
        f.write(f"torch import failed: {e}\\n")
    f.flush()


def run_training():
    with open(LOG_PATH, "w", buffering=1) as f:
        _print_env(f)
        f.write(f"$ {' '.join(LAUNCH_CMD)}\\n")
        f.flush()
        p = subprocess.Popen(
            LAUNCH_CMD, stdout=f, stderr=subprocess.STDOUT, cwd="/app"
        )
        rc = p.wait()
        f.write(f"\\n[exit code = {rc}]\\n")


class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        try:
            with open(LOG_PATH, "r") as f:
                body = f.read()
        except FileNotFoundError:
            body = "starting..."
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.end_headers()
        self.wfile.write(body.encode("utf-8"))

    def log_message(self, fmt, *args):
        return


if __name__ == "__main__":
    threading.Thread(target=run_training, daemon=True).start()
    HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
'''


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options (``--repo``, ``--gpu``, ``--private``, ``--mode``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", required=True, help="HF Space repo, e.g. user/wjad-sandbox")
    parser.add_argument(
        "--gpu",
        default="a10g-small",
        help="HF Spaces 硬件,默认 a10g-small(省 GPU 小时)",
    )
    parser.add_argument("--private", action="store_true")
    parser.add_argument(
        "--mode",
        choices=["smoke", "real_data"],
        default="smoke",
        help="smoke=随机张量;real_data=拉真实标签+占位视频跑 trainer",
    )
    return parser.parse_args()


def _ensure_space_hardware(api: HfApi, repo_id: str, hardware: str) -> None:
    """Make sure the Space ``repo_id`` requests the ``hardware`` tier.

    ``create_repo(..., exist_ok=True)`` returns without applying
    ``space_hardware`` when the Space already exists, so a reused Space would
    otherwise keep whatever hardware it had before. The tier is only
    re-requested when it differs from what the Space currently requests.
    """
    runtime = api.get_space_runtime(repo_id=repo_id)
    if runtime.requested_hardware != hardware:
        api.request_space_hardware(repo_id=repo_id, hardware=hardware)


def _write_space_files() -> None:
    """Write the generated ``Dockerfile`` and ``app.py`` into ``ROOT``.

    Existing files with the same names are overwritten.
    """
    (ROOT / "Dockerfile").write_text(DOCKERFILE, encoding="utf-8")
    (ROOT / "app.py").write_text(APP_PY, encoding="utf-8")


def main() -> None:
    """Create/reuse the Space, configure it, and upload the repository.

    Steps, in order: create or reuse the docker Space; align its hardware with
    ``--gpu``; store ``SANDBOX_MODE`` as a Space variable; write ``Dockerfile``
    and ``app.py`` to the repository root; upload the repository.

    Errors from ``huggingface_hub`` calls (authentication, permissions,
    network) are not caught here and propagate to the caller.
    """
    args = _parse_args()

    api = HfApi()
    print(f"[push_to_sandbox] 创建 / 复用 Space: {args.repo} (GPU={args.gpu}, mode={args.mode})")
    create_repo(
        args.repo,
        repo_type="space",
        space_sdk="docker",
        space_hardware=args.gpu,
        private=args.private,
        exist_ok=True,
    )
    # A reused Space ignores ``space_hardware`` above; apply ``--gpu`` explicitly.
    _ensure_space_hardware(api, args.repo, args.gpu)

    # Store SANDBOX_MODE as a Space variable. HF_TOKEN must be added by the
    # user under Space Settings -> Secrets, using a token that can access the
    # NVIDIA dataset (required for real_data mode).
    api.add_space_variable(repo_id=args.repo, key="SANDBOX_MODE", value=args.mode)
    if args.mode == "real_data":
        print(
            "[push_to_sandbox] 提醒:real_data 模式需要在 Space Settings -> Secrets "
            "里手动添加 HF_TOKEN(必须是能访问 nvidia/PhysicalAI-Autonomous-Vehicle-"
            "Cosmos-Drive-Dreams 的账号 token,否则 download.py 会拒绝访问)。"
        )

    # Write Dockerfile / app.py to disk so upload_folder picks them up.
    _write_space_files()

    print("[push_to_sandbox] 上传仓库(排除 .venv / data / 缓存)...")
    api.upload_folder(
        folder_path=str(ROOT),
        repo_id=args.repo,
        repo_type="space",
        ignore_patterns=[
            ".venv/*",
            "data/*",
            "**/__pycache__/*",
            "*.pyc",
            "agent-tools/*",
            ".git/*",
        ],
    )
    print(f"[push_to_sandbox] OK -> https://huggingface.co/spaces/{args.repo}")


if __name__ == "__main__":
    main()