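"""Push the code to an HF Space and run a sandbox micro-training job.

According to the estimate from ``estimate_memory.py``:
  - BS=8 + bf16 + PCGrad + GradNorm needs >= 34 GB of GPU memory;
  - the default hardware is **a10g-small** (~24 GB), which matches the tiny
    settings of ``smoke_train`` / ``sandbox_real_data``;
  - to run the full BS=8, use ``--gpu a10g-large`` or an A100.

This script:
  1. calls ``huggingface_hub.create_repo`` to create (or reuse) a Space on HF,
     with Space SDK = ``docker``;
  2. sets the ``SANDBOX_MODE`` Space variable from ``--mode``;
  3. writes ``Dockerfile`` + ``app.py`` into the repository root (``app.py``
     runs the micro-training when the Space starts);
  4. uploads the current repository with ``upload_folder`` (excluding
     ``.venv``, datasets, etc.).

Prerequisite: run ``hf auth login`` locally first.
"""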
|
|
| from __future__ import annotations |
|
|
| import argparse |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi, create_repo |
|
|
# Directory two levels above this file; it is both the folder uploaded to the
# Space and the place where the generated Dockerfile / app.py are written.
# NOTE(review): presumably this script lives in ``<repo>/scripts/`` so that
# ROOT is the repository root -- confirm against the actual layout.
ROOT = Path(__file__).resolve().parent.parent
# Dockerfile text written to ``ROOT / "Dockerfile"`` before the upload.
# It starts from a CUDA 12.4.1 + cuDNN runtime image, installs Python and a few
# system libraries, switches to a uid-1000 ``user`` account, copies the project
# files (pyproject.toml, src, scripts, configs, the DINOv3 checkpoint directory
# and app.py) into /app, installs torch/torchvision from the cu124 wheel index
# followed by an editable install of the project, exposes port 7860 and starts
# ``app.py``.  The doubled backslashes become single line-continuation
# backslashes in the generated file.
DOCKERFILE = """FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
RUN apt-get update && apt-get install -y --no-install-recommends \\
python3 python3-pip python3-venv ffmpeg libgl1 libglib2.0-0 git \\
&& rm -rf /var/lib/apt/lists/*

# HF Space 默认用户(避免权限问题)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH

WORKDIR /app
COPY --chown=user pyproject.toml /app/
COPY --chown=user src /app/src
COPY --chown=user scripts /app/scripts
COPY --chown=user configs /app/configs
COPY --chown=user dinov3-vitb16-pretrain-lvd1689m /app/dinov3-vitb16-pretrain-lvd1689m
COPY --chown=user app.py /app/app.py

RUN python3 -m pip install --user --no-cache-dir --upgrade pip \\
&& python3 -m pip install --user --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu124 \\
&& python3 -m pip install --user --no-cache-dir -e .

EXPOSE 7860
CMD ["python3", "app.py"]
"""
|
|
# Source text of the ``app.py`` entry point that is written to
# ``ROOT / "app.py"`` and started by the Dockerfile's CMD inside the Space.
#
# The generated program:
#   * picks the launch command from the ``SANDBOX_MODE`` environment variable
#     (``real_data`` -> scripts/sandbox_real_data.py, anything else ->
#     scripts/smoke_train.py);
#   * runs that command in a daemon thread, sending its stdout/stderr to
#     /tmp/wjad.log after a short environment banner;
#   * serves the current log contents as plain text on port 7860.
#
# The log is read while the training process is still appending to it, so the
# HTTP handler opens it as UTF-8 with ``errors="replace"``: a partially written
# multi-byte character (or any non-UTF-8 output) must not raise
# ``UnicodeDecodeError`` and leave the request without a response.
#
# The doubled backslashes (``\\n``) are intentional: they become ``\n`` escape
# sequences in the generated file rather than literal newlines in this string.
APP_PY = '''"""HF Sandbox 入口(docker SDK,监听 7860)。

启动后:
1. 后台进程跑 scripts/smoke_train.py(追加写入 /tmp/wjad.log)
2. 主进程开 HTTP server on :7860,返回最新日志

阶段 A(无需数据):smoke_train 用随机张量验证 GPU 上的 forward/反传/AMP/PCGrad。
阶段 B(需要数据):把 LAUNCH_CMD 改为 runner_local 的真实训练命令。
"""
import os
import subprocess
import sys
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

LOG_PATH = "/tmp/wjad.log"
PORT = 7860
# 当 SANDBOX_MODE=real_data 时跑真实标签 + 占位视频;否则跑随机张量 smoke。
_MODE = os.environ.get("SANDBOX_MODE", "smoke")
if _MODE == "real_data":
    LAUNCH_CMD = [sys.executable, "scripts/sandbox_real_data.py"]
else:
    LAUNCH_CMD = [sys.executable, "scripts/smoke_train.py"]


def _print_env(f):
    f.write("=" * 72 + "\\n")
    f.write(" WJAD HF Sandbox\\n")
    f.write("=" * 72 + "\\n")
    f.write(f"Python: {sys.version}\\n")
    try:
        import torch
        f.write(f"torch: {torch.__version__} cuda_avail={torch.cuda.is_available()}\\n")
        if torch.cuda.is_available():
            p = torch.cuda.get_device_properties(0)
            f.write(f"device: {p.name} vram={p.total_memory / 1024**3:.2f} GB\\n")
    except Exception as e:
        f.write(f"torch import failed: {e}\\n")
    f.flush()


def run_training():
    with open(LOG_PATH, "w", buffering=1) as f:
        _print_env(f)
        f.write(f"$ {' '.join(LAUNCH_CMD)}\\n")
        f.flush()
        p = subprocess.Popen(
            LAUNCH_CMD, stdout=f, stderr=subprocess.STDOUT, cwd="/app"
        )
        rc = p.wait()
        f.write(f"\\n[exit code = {rc}]\\n")


class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        try:
            # The log may end mid-character while training is still writing.
            with open(LOG_PATH, "r", encoding="utf-8", errors="replace") as f:
                body = f.read()
        except FileNotFoundError:
            body = "starting..."
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.end_headers()
        self.wfile.write(body.encode("utf-8"))

    def log_message(self, fmt, *args):
        return


if __name__ == "__main__":
    threading.Thread(target=run_training, daemon=True).start()
    HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
'''
|
|
|
|
| def main() -> None: |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--repo", required=True, help="HF Space repo, e.g. user/wjad-sandbox") |
| parser.add_argument("--gpu", default="a10g-small", help="HF Spaces 硬件,默认 a10g-small(省 GPU 小时)") |
| parser.add_argument("--private", action="store_true") |
| parser.add_argument( |
| "--mode", |
| choices=["smoke", "real_data"], |
| default="smoke", |
| help="smoke=随机张量;real_data=拉真实标签+占位视频跑 trainer", |
| ) |
| args = parser.parse_args() |
|
|
| api = HfApi() |
| print(f"[push_to_sandbox] 创建 / 复用 Space: {args.repo} (GPU={args.gpu}, mode={args.mode})") |
| create_repo( |
| args.repo, |
| repo_type="space", |
| space_sdk="docker", |
| space_hardware=args.gpu, |
| private=args.private, |
| exist_ok=True, |
| ) |
|
|
| |
| |
| api.add_space_variable(repo_id=args.repo, key="SANDBOX_MODE", value=args.mode) |
| if args.mode == "real_data": |
| print( |
| "[push_to_sandbox] 提醒:real_data 模式需要在 Space Settings -> Secrets " |
| "里手动添加 HF_TOKEN(必须是能访问 nvidia/PhysicalAI-Autonomous-Vehicle-" |
| "Cosmos-Drive-Dreams 的账号 token,否则 download.py 会拒绝访问)。" |
| ) |
|
|
| |
| (ROOT / "Dockerfile").write_text(DOCKERFILE, encoding="utf-8") |
| (ROOT / "app.py").write_text(APP_PY, encoding="utf-8") |
|
|
| print("[push_to_sandbox] 上传仓库(排除 .venv / data / 缓存)...") |
| api.upload_folder( |
| folder_path=str(ROOT), |
| repo_id=args.repo, |
| repo_type="space", |
| ignore_patterns=[ |
| ".venv/*", |
| "data/*", |
| "**/__pycache__/*", |
| "*.pyc", |
| "agent-tools/*", |
| ".git/*", |
| ], |
| ) |
| print(f"[push_to_sandbox] OK -> https://huggingface.co/spaces/{args.repo}") |
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|