Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

configs/cosmos_hub_extract_lidar_and_generation_only.txt +6 -0
configs/cosmos_hub_replace_lidar_and_generation.txt +5 -0
scripts/jobs_extract_archives.py +199 -4
scripts/push_to_jobs.py +92 -32

configs/cosmos_hub_extract_lidar_and_generation_only.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# 三个并行 Job：① lidar_raw ② pose（磁盘问题重试）③ single_view 内仅 generation（分卷），不碰 caption/hdmap
+# extract-parallel --replace-subdirs-file ... --shards-file 本文件
+lidar_raw
+pose
+cosmos_synthetic/single_view|only=generation

configs/cosmos_hub_replace_lidar_and_generation.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# 重试前仅删除下列 extracted 子树，其它不动（路径相对 extracted-subpath，默认 extracted/cosmos_hub）
+lidar_raw
+pose
+cosmos_synthetic/single_view/generation

scripts/jobs_extract_archives.py CHANGED Viewed

@@ -7,6 +7,15 @@
 解压产物只写在 ``--out-root`` 下；**不在 mirror 里落任何文件**（进度标记也放在
 ``out-root/_wjad_extract_state/``，与归档相对路径对应，避免修改 ``--scan-root``）。
 示例（mount ``hf://buckets/.../WJAD`` → ``/mnt/wjad``）::
     python scripts/jobs_extract_archives.py \\
@@ -17,10 +26,144 @@
 from __future__ import annotations
 import argparse
 import tarfile
 import zipfile
 from pathlib import Path
 def _archive_stem(path: Path) -> str:
     n = path.name
@@ -42,9 +185,6 @@ def _is_archive(path: Path) -> bool:
     )
-STATE_DIRNAME = "_wjad_extract_state"
 def _done_marker_path(archive: Path, scan: Path, out_root: Path) -> Path:
     """标记只写在 out_root 下，绝不写回 mirror。"""
     rel = archive.relative_to(scan)
@@ -87,31 +227,86 @@ def _extract_one(archive: Path, dest_dir: Path) -> None:
             tf.extractall(dest_dir)
 def main() -> None:
     p = argparse.ArgumentParser()
     p.add_argument("--scan-root", type=Path, required=True)
     p.add_argument("--out-root", type=Path, required=True)
     args = p.parse_args()
     scan: Path = args.scan_root.resolve()
     out_root: Path = args.out_root.resolve()
     if not scan.is_dir():
         raise SystemExit(f"--scan-root not a directory: {scan}")
     _validate_roots(scan, out_root)
     count = 0
     for path in sorted(scan.rglob("*")):
         if not path.is_file() or not _is_archive(path):
             continue
         mpath = _done_marker_path(path, scan, out_root)
         if mpath.exists():
             continue
         rel_parent = path.parent.relative_to(scan)
-        dest = out_root / rel_parent / _archive_stem(path)
         print(f"[extract] {path.relative_to(scan)} -> {dest.relative_to(out_root)}", flush=True)
         _extract_one(path, dest)
         mpath.parent.mkdir(parents=True, exist_ok=True)
         mpath.write_text("ok\n", encoding="utf-8")
         count += 1
     print(f"[extract] done, {count} archives", flush=True)

 解压产物只写在 ``--out-root`` 下；**不在 mirror 里落任何文件**（进度标记也放在
 ``out-root/_wjad_extract_state/``，与归档相对路径对应，避免修改 ``--scan-root``）。
+在 HF Job 内若设置了 ``WJAD_CACHE_ROOT`` / ``TMPDIR``（见 ``push_to_jobs`` 的 cache 环境），
+本脚本会在**每解压完一个归档后**清空 ``TMPDIR`` 下的临时内容，降低容器 ephemeral 占用。
+**分卷归档**（与 Cosmos README 中 ``generation.tar.gz.part-*`` 一致）：同目录下若干片文件，
+按分卷序号排序后 **cat 拼成字节流**，再交给 ``tar`` 从 stdin 解压，**不先合并成大文件**（省磁盘）。
+可选 ``--only-stems a,b``（或环境变量 ``WJAD_EXTRACT_ONLY_STEMS``）：只解压这些「目标目录名」对应的归档
+（即 ``_archive_stem``：如 ``generation.tar.gz`` / 分卷逻辑名 ``generation.tar.gz`` → stem ``generation``）。
 示例（mount ``hf://buckets/.../WJAD`` → ``/mnt/wjad``）::
     python scripts/jobs_extract_archives.py \\
 from __future__ import annotations
 import argparse
+import gc
+import os
+import re
+import shutil
+import subprocess
 import tarfile
+import tempfile
+import threading
 import zipfile
+from collections import defaultdict
 from pathlib import Path
+STATE_DIRNAME = "_wjad_extract_state"
def _ensure_tmp_on_bucket() -> None:
    """Point Python's temporary directory at the cache root when one is configured.

    When ``WJAD_CACHE_ROOT`` is set (non-blank), ``<root>/tmp`` is created and
    used as the default for ``TMPDIR`` / ``TEMP`` / ``TMP``; values that are
    already present in the environment are left untouched. Whatever ``TMPDIR``
    ends up holding (if non-blank) is then installed as ``tempfile.tempdir``.
    """
    cache_root = os.environ.get("WJAD_CACHE_ROOT", "").strip()
    if cache_root:
        bucket_tmp = Path(cache_root) / "tmp"
        bucket_tmp.mkdir(parents=True, exist_ok=True)
        # setdefault: never override a value the caller exported explicitly.
        for var_name in ("TMPDIR", "TEMP", "TMP"):
            os.environ.setdefault(var_name, str(bucket_tmp))

    configured = os.environ.get("TMPDIR", "").strip()
    if configured:
        tempfile.tempdir = configured
def _clean_tmpdir_after_archive() -> None:
    """Empty the temporary directory after one archive has been extracted.

    The directory is taken from ``TMPDIR``; when that is unset or blank it
    falls back to ``$WJAD_CACHE_ROOT/tmp``. Nothing happens when neither is
    configured or the directory does not exist. The directory itself is kept;
    only its children are removed.

    Cleanup is best effort: a child that cannot be removed is skipped so that
    a cleanup problem never fails the extraction run.
    """
    raw = os.environ.get("TMPDIR", "").strip()
    if not raw:
        cache_root = os.environ.get("WJAD_CACHE_ROOT", "").strip()
        if cache_root:
            raw = str(Path(cache_root) / "tmp")
    if not raw:
        return
    tmp = Path(raw)
    if not tmp.is_dir():
        return
    # Snapshot the listing first: entries are deleted while we walk it.
    for child in list(tmp.iterdir()):
        try:
            # is_dir() follows symlinks, and rmtree() refuses to operate on a
            # symlink, so a link to a directory must be unlinked like a file
            # (this removes the link only, never its target).
            if child.is_dir() and not child.is_symlink():
                shutil.rmtree(child, ignore_errors=True)
            else:
                child.unlink(missing_ok=True)
        except OSError:
            # Deliberate best effort, see docstring.
            pass
    gc.collect()
# e.g. generation.tar.gz.part-aa -> logical name generation.tar.gz, volume key aa
_SPLIT_PART_RE = re.compile(r"(?i)^(?P<base>.+)\.part-(?P<suf>[a-z0-9]+)$")


def _split_part_info(path: Path) -> tuple[str, str] | None:
    """Split a volume file name into ``(logical archive name, volume suffix)``.

    Only ``path.name`` is inspected. Returns ``None`` when the name does not
    look like ``<base>.part-<suffix>``.
    """
    matched = _SPLIT_PART_RE.match(path.name)
    if matched is None:
        return None
    return (matched.group("base"), matched.group("suf"))
def _split_part_sort_key(path: Path) -> tuple[int, int | str]:
    """Return the sort key that puts the volumes of one archive in order.

    All-digit suffixes sort first, by numeric value; other suffixes sort next,
    by string comparison; names that are not split volumes sort last.
    """
    parsed = _split_part_info(path)
    if parsed is None:
        return (2, "")
    _, suffix = parsed
    return (0, int(suffix)) if suffix.isdigit() else (1, suffix)
def _collect_split_groups(scan: Path) -> dict[tuple[Path, str], list[Path]]:
    """Group split-volume files found under *scan*.

    Volumes that live in the same directory and share the same logical
    archive name form one group. Keys are ``(resolved parent directory,
    logical archive name)``; values are the volume paths in discovery order
    (not yet sorted by volume suffix).
    """
    grouped: dict[tuple[Path, str], list[Path]] = defaultdict(list)
    for candidate in scan.rglob("*"):
        if candidate.is_file():
            parsed = _split_part_info(candidate)
            if parsed is not None:
                grouped[(candidate.parent.resolve(), parsed[0])].append(candidate)
    # Hand back a plain dict so later lookups cannot create empty groups.
    return dict(grouped)
def _tar_stdin_args(logical_name: str, dest_dir: Path) -> list[str]:
    """Build the ``tar`` command line that extracts an archive read from stdin.

    The compression flag is picked from the suffix of *logical_name*
    (case-insensitive); the archive is unpacked into *dest_dir* via ``-C``.
    Relies on a ``tar`` executable being available on ``PATH``.

    Raises:
        ValueError: *logical_name* is not a ``.tar`` / ``.tar.gz`` / ``.tgz``
            / ``.tar.bz2`` / ``.tar.xz`` name.
    """
    lower = logical_name.lower()
    dest = str(dest_dir)
    if lower.endswith((".tar.gz", ".tgz")):
        return ["tar", "-xzf", "-", "-C", dest]
    if lower.endswith(".tar.bz2"):
        return ["tar", "-xjf", "-", "-C", dest]
    if lower.endswith(".tar.xz"):
        return ["tar", "-xJf", "-", "-C", dest]
    if lower.endswith(".tar"):
        return ["tar", "-xf", "-", "-C", dest]
    raise ValueError(f"不支持的拼接分卷类型（仅 tar / tar.*）: {logical_name!r}")


def _extract_split_volumes(parts: list[Path], logical_name: str, dest_dir: Path) -> None:
    """Stream split volumes, in the given order, into ``tar`` reading from stdin.

    The volumes are concatenated on the fly; no merged archive is ever written
    to disk. *dest_dir* is created when missing.

    The calling thread feeds tar's stdin while a helper thread drains tar's
    stderr. ``Popen.communicate()`` must not be used here: called without
    ``input`` it closes the child's stdin right away, which would cut the
    archive stream off while it is still being written.

    Args:
        parts: Volume files, already sorted in concatenation order.
        logical_name: Name of the reassembled archive; selects the tar flags.
        dest_dir: Directory the archive is unpacked into.

    Raises:
        ValueError: *parts* is empty, or *logical_name* is not a supported
            tar variant.
        RuntimeError: ``tar`` exited with a non-zero status; the message
            carries tar's stderr output.
        OSError: a volume could not be read (tar is killed first).
    """
    if not parts:
        raise ValueError("分卷列表为空")
    dest_dir.mkdir(parents=True, exist_ok=True)
    tar_args = _tar_stdin_args(logical_name, dest_dir)
    proc = subprocess.Popen(
        tar_args,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Both are pipes by construction; the assert only narrows the Optional type.
    assert proc.stdin is not None and proc.stderr is not None
    tar_stdin, tar_stderr = proc.stdin, proc.stderr

    captured: list[bytes] = []

    def _drain_stderr() -> None:
        # Keep reading so tar can never block on a full stderr pipe.
        captured.append(tar_stderr.read())

    drainer = threading.Thread(target=_drain_stderr, daemon=True)
    drainer.start()
    try:
        try:
            for part in parts:
                with part.open("rb") as src:
                    shutil.copyfileobj(src, tar_stdin, length=1024 * 1024 * 8)
        except BrokenPipeError:
            # tar stopped reading early; its exit status below tells whether
            # that was an error, and stderr carries the reason.
            pass
        finally:
            # Closing stdin is the end-of-archive signal for tar.
            try:
                tar_stdin.close()
            except OSError:
                pass
        proc.wait()
    except BaseException:
        # Reading a volume failed (or we were interrupted): do not leave a
        # half-fed tar process behind.
        proc.kill()
        proc.wait()
        raise
    finally:
        drainer.join(timeout=30)
        tar_stderr.close()
    if proc.returncode != 0:
        msg = b"".join(captured).decode(errors="replace")
        raise RuntimeError(f"tar 解压分卷流失败 (exit {proc.returncode}): {msg}")
def _done_marker_split(scan: Path, out_root: Path, group_parent: Path, logical_name: str) -> Path:
    """Return the "done" marker path for one split-volume group.

    The marker lives under ``out_root / STATE_DIRNAME`` and mirrors the
    group's directory relative to *scan*; its file name is the logical
    archive name plus ``.wjad_done``.
    """
    relative_dir = group_parent.resolve().relative_to(scan)
    marker_name = f"{logical_name}.wjad_done"
    return out_root / STATE_DIRNAME / relative_dir / marker_name
 def _archive_stem(path: Path) -> str:
     n = path.name
     )
 def _done_marker_path(archive: Path, scan: Path, out_root: Path) -> Path:
     """标记只写在 out_root 下，绝不写回 mirror。"""
     rel = archive.relative_to(scan)
             tf.extractall(dest_dir)
def _only_stems_set(args: argparse.Namespace) -> set[str] | None:
    """Resolve the optional stem filter.

    ``args.only_stems`` takes precedence; when it is missing or blank the
    ``WJAD_EXTRACT_ONLY_STEMS`` environment variable is consulted. The value
    is a comma-separated list; surrounding whitespace and empty items are
    dropped. Returns ``None`` (meaning "no filter") when nothing usable is
    configured.
    """
    cli_value = getattr(args, "only_stems", None)
    spec = cli_value.strip() if cli_value else ""
    if not spec:
        spec = os.environ.get("WJAD_EXTRACT_ONLY_STEMS", "").strip()
    if not spec:
        return None

    wanted: set[str] = set()
    for token in spec.split(","):
        token = token.strip()
        if token:
            wanted.add(token)
    return wanted if wanted else None
def _stem_ok(only: set[str] | None, stem: str) -> bool:
    """Tell whether *stem* passes the filter; ``None`` means "accept everything"."""
    if only is None:
        return True
    return stem in only
 def main() -> None:
     """Extract every archive found under ``--scan-root`` into ``--out-root``.

     Two passes are made, both skipping archives whose "done" marker already
     exists and archives rejected by the optional stem filter:

     1. regular archive files, in sorted path order (split-volume files are
        left out of this pass);
     2. split-volume groups, ordered by relative directory and logical name,
        each streamed through tar.

     After each successful extraction the temp directory is cleaned and the
     marker is written.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument("--scan-root", type=Path, required=True)
     parser.add_argument("--out-root", type=Path, required=True)
     parser.add_argument(
         "--only-stems",
         default=None,
         help="逗号分隔，只解压这些 stem（与解压目标目录名一致，如 generation）",
     )
     args = parser.parse_args()
     only = _only_stems_set(args)

     scan: Path = args.scan_root.resolve()
     out_root: Path = args.out_root.resolve()
     if not scan.is_dir():
         raise SystemExit(f"--scan-root not a directory: {scan}")
     _validate_roots(scan, out_root)
     _ensure_tmp_on_bucket()

     split_groups = _collect_split_groups(scan)
     part_paths: set[Path] = {
         member for members in split_groups.values() for member in members
     }

     def _finish(marker: Path) -> None:
         # Shared epilogue of both passes: free temp space, then record success.
         _clean_tmpdir_after_archive()
         marker.parent.mkdir(parents=True, exist_ok=True)
         marker.write_text("ok\n", encoding="utf-8")

     def _group_order(key: tuple[Path, str]) -> tuple[str, str]:
         parent, logical_name = key
         return (str(parent.resolve().relative_to(scan)), logical_name.lower())

     count = 0

     # Pass 1: ordinary, single-file archives.
     for archive in sorted(scan.rglob("*")):
         if not (archive.is_file() and _is_archive(archive)):
             continue
         if archive in part_paths:
             continue
         stem = _archive_stem(archive)
         if not _stem_ok(only, stem):
             continue
         marker = _done_marker_path(archive, scan, out_root)
         if marker.exists():
             continue
         dest = out_root / archive.parent.relative_to(scan) / stem
         print(f"[extract] {archive.relative_to(scan)} -> {dest.relative_to(out_root)}", flush=True)
         _extract_one(archive, dest)
         _finish(marker)
         count += 1

     # Pass 2: split-volume groups.
     for group_key in sorted(split_groups, key=_group_order):
         group_parent, logical = group_key
         stem = _archive_stem(Path(logical))
         if not _stem_ok(only, stem):
             continue
         ordered_parts = sorted(split_groups[group_key], key=_split_part_sort_key)
         marker = _done_marker_split(scan, out_root, group_parent, logical)
         if marker.exists():
             continue
         dest = out_root / group_parent.resolve().relative_to(scan) / stem
         print(
             f"[extract-split] {logical} ({len(ordered_parts)} parts) -> {dest.relative_to(out_root)}",
             flush=True,
         )
         _extract_split_volumes(ordered_parts, logical, dest)
         _finish(marker)
         count += 1

     print(f"[extract] done, {count} archives", flush=True)

scripts/push_to_jobs.py CHANGED Viewed

@@ -19,7 +19,8 @@ Bucket 目录约定（相对挂载根）::
 - ``inspect-extract-job <job_id>`` — 从 ``hf jobs inspect --json`` 中解析 ``--scan-root``，列出该解压 Job 负责的顶层文件夹名。
 - ``extract-parallel`` — 多个并行解压 Job；**合并**为各子目录在 ``extracted/.../<名>/`` 下并列。
   ``--shard-dirs a,b,c`` 表示 **3 个 Job**（各处理一个顶层子文件夹）。
-  ``--shards-file`` 中 **每一行一个 Job**；行内用逗号分隔的多个目录会在 **同一 Job 内顺序解压**（例如 6 行 → 6 个并行 Job）。
 单独步骤可加 ``--detach``。Windows 上会使用 ``sys.executable`` 同目录的 ``hf.exe``。
 """
@@ -98,14 +99,22 @@ def build_copy_cmd(args: argparse.Namespace) -> list[str]:
 def _bucket_cache_env_sh(cache_subpath: str) -> str:
-    """单行：pip / 临时目录 / HF 相关缓存全部落在挂载盘 cache/ 下。"""
     c = f"/mnt/wjad/{cache_subpath}"
     return (
         f"export WJAD_CACHE_ROOT={c} && "
-        f"mkdir -p {c}/pip {c}/tmp {c}/hf {c}/transformers {c}/torch {c}/datasets {c}/xdg && "
         f"export PIP_CACHE_DIR={c}/pip && export TMPDIR={c}/tmp && export TEMP={c}/tmp && export TMP={c}/tmp && "
         f"export HF_HOME={c}/hf && export TRANSFORMERS_CACHE={c}/transformers && export TORCH_HOME={c}/torch && "
-        f"export HF_DATASETS_CACHE={c}/datasets && export XDG_CACHE_HOME={c}/xdg"
     )
@@ -126,7 +135,7 @@ def _sanitize_clone_tag_for_group(group: list[str]) -> str:
 def _safe_rel_subdir_segment(name: str) -> str:
-    """mirror/extracted 下单层子目录名，禁止路径穿越。"""
     n = name.strip()
     if not n or n in (".", "..") or "/" in n or "\\" in n or n.startswith("-"):
         raise ValueError(f"非法子目录名（禁止路径穿越）: {name!r}")
@@ -135,6 +144,20 @@ def _safe_rel_subdir_segment(name: str) -> str:
     return n
 def build_wipe_extracted_cmd(args: argparse.Namespace) -> list[str]:
     """删掉整个 extracted 根目录（并行分片前或单 Job 全量替换前）。"""
     vol = f"hf://buckets/{args.bucket}:/mnt/wjad"
@@ -167,7 +190,7 @@ def build_wipe_extracted_subdirs_cmd(args: argparse.Namespace, subdirs: list[str
     ext_base = f"/mnt/wjad/{args.extracted_subpath.rstrip('/')}"
     rm_parts = [
         f'echo "[wipe-subdir] {ext_base}/{d}" && rm -rf {ext_base}/{d}'
-        for d in (_safe_rel_subdir_segment(s) for s in subdirs)
     ]
     inner = "set -e && " + " && ".join(rm_parts)
     cmd = [
@@ -229,18 +252,25 @@ def build_extract_cmd(
         for _, ed in pairs:
             steps.append(f"mkdir -p {ed}")
     extract_chain = " && ".join(
-        f'echo "[extract] {sd} -> {ed}" && python scripts/jobs_extract_archives.py --scan-root {sd} --out-root {ed}'
         for sd, ed in pairs
     )
     steps.extend(
         [
-            "command -v git >/dev/null 2>&1 || (apt-get update && apt-get install -y --no-install-recommends git)",
-            "pip install -q -U huggingface_hub",
-            f"rm -rf {clone_dir} && git clone https://oauth2:$HF_TOKEN@huggingface.co/{args.code_repo} {clone_dir}",
             f"cd {clone_dir}",
             extract_chain,
         ]
     )
     inner = " && ".join(steps)
@@ -277,12 +307,15 @@ def build_train_cmd(args: argparse.Namespace) -> list[str]:
             f"export WJAD_HUB_REPO={args.weights_repo}",
             f"export WJAD_DATA_ROOT=/mnt/wjad/{args.extracted_subpath}",
             "export WJAD_OUTPUT_DIR=/mnt/wjad/runs/current",
-            "command -v git >/dev/null 2>&1 || (apt-get update && apt-get install -y --no-install-recommends git)",
-            "pip install -q -U huggingface_hub",
-            f"rm -rf {clone_dir} && git clone https://oauth2:$HF_TOKEN@huggingface.co/{args.code_repo} {clone_dir}",
             f"cd {clone_dir}",
-            "pip install -q -U pip",
-            "pip install -q -e .",
             "bash scripts/jobs_entry_train.sh",
         ]
     )
@@ -307,14 +340,33 @@ def build_train_cmd(args: argparse.Namespace) -> list[str]:
     return cmd
-def _load_shard_groups(args: argparse.Namespace) -> list[list[str]]:
-    """CLI ``--shard-dirs a,b`` → 两个 Job；``--shards-file`` 每行一个 Job，行内逗号为同 Job 多目录。"""
-    groups: list[list[str]] = []
     if getattr(args, "shard_dirs", None):
         for p in args.shard_dirs.split(","):
             p = p.strip()
             if p:
-                groups.append([p])
     sf = getattr(args, "shards_file", None)
     if sf:
         raw = Path(sf).expanduser().read_text(encoding="utf-8")
@@ -322,21 +374,22 @@ def _load_shard_groups(args: argparse.Namespace) -> list[list[str]]:
             line = line.strip()
             if not line or line.startswith("#"):
                 continue
-            parts = [x.strip() for x in line.split(",") if x.strip()]
             if parts:
-                groups.append(parts)
-    seen: set[tuple[str, ...]] = set()
-    uniq: list[list[str]] = []
-    for g in groups:
-        t = tuple(g)
         if t not in seen:
             seen.add(t)
-            uniq.append(g)
     return uniq
 def _load_replace_subdirs(args: argparse.Namespace) -> list[str]:
-    """从 CLI / 文件收集要预先删除的 extracted 子目录（单层名）。"""
     raw: list[str] = []
     rs = getattr(args, "replace_subdirs", None)
     if rs:
@@ -354,7 +407,7 @@ def _load_replace_subdirs(args: argparse.Namespace) -> list[str]:
     seen: set[str] = set()
     out: list[str] = []
     for s in raw:
-        seg = _safe_rel_subdir_segment(s)
         if seg not in seen:
             seen.add(seg)
             out.append(seg)
@@ -401,11 +454,12 @@ def run_extract_parallel(args: argparse.Namespace) -> int:
         if _wait_job(jid, label="wipe extracted subdirs") != 0:
             return 1
     jids: list[tuple[str, str]] = []
-    for grp in groups:
         label = ",".join(grp)
         sargs = argparse.Namespace(**vars(args))
         sargs.detach = True
         sargs.replace_extracted = False
         rc, jid = _submit_detach(build_extract_cmd(sargs, shard_group=grp))
         if rc != 0:
             return rc
@@ -567,13 +621,13 @@ def main() -> None:
         "--replace-subdirs",
         default=None,
         dest="replace_subdirs",
-        help="extract-parallel：仅 rm -rf 下列子目录（逗号分隔，相对 extracted-subpath），再提交解压",
     )
     p.add_argument(
         "--replace-subdirs-file",
         default=None,
         dest="replace_subdirs_file",
-        help="extract-parallel：同上，每行一个目录名；# 注释；行内逗号可多个",
     )
     p.add_argument(
         "--shard-dirs",
@@ -585,7 +639,13 @@ def main() -> None:
         "--shards-file",
         default=None,
         dest="shards_file",
-        help="extract-parallel：每行一个 Job；行内逗号分隔多个子目录（同 Job 内顺序解压）；# 开头为注释",
     )
     p.add_argument("--train-image", default=TRAIN_IMAGE, dest="train_image")
     p.add_argument("--train-flavor", default=TRAIN_FLAVOR, dest="train_flavor")

 - ``inspect-extract-job <job_id>`` — 从 ``hf jobs inspect --json`` 中解析 ``--scan-root``，列出该解压 Job 负责的顶层文件夹名。
 - ``extract-parallel`` — 多个并行解压 Job；**合并**为各子目录在 ``extracted/.../<名>/`` 下并列。
   ``--shard-dirs a,b,c`` 表示 **3 个 Job**（各处理一个顶层子文件夹）。
+  ``--shards-file`` 支持每行后缀 ``|only=stem1,stem2``，传给 ``jobs_extract_archives --only-stems``，
+  仅在当次扫描目录内解压这些归档（如仅 ``generation`` 分卷，跳过同目录的 ``caption``/``hdmap``）。
 单独步骤可加 ``--detach``。Windows 上会使用 ``sys.executable`` 同目录的 ``hf.exe``。
 """
 def _bucket_cache_env_sh(cache_subpath: str) -> str:
     """Return a single-line shell snippet that moves caches onto the mounted bucket.

     The snippet creates the cache directories under
     ``/mnt/wjad/<cache_subpath>`` and exports the environment variables that
     point pip, temp files, the Hugging Face / torch caches, ``HOME``, the
     bytecode cache prefix and the XDG base directories at them. Every
     directory that is exported is also created by the ``mkdir -p`` step,
     including the XDG config and data directories.

     Args:
         cache_subpath: Cache directory relative to the ``/mnt/wjad`` mount.
             It is interpolated into the shell text as-is.

     Returns:
         Commands joined with ``&&``, without a trailing separator.
     """
     c = f"/mnt/wjad/{cache_subpath}"
     return (
         f"export WJAD_CACHE_ROOT={c} && "
         f"mkdir -p {c}/pip {c}/tmp {c}/hf {c}/transformers {c}/torch {c}/datasets {c}/xdg "
         f"{c}/xdg_config {c}/xdg_data {c}/jobhome {c}/pycache && "
         f"export HOME={c}/jobhome && "
         f"export PYTHONPYCACHEPREFIX={c}/pycache && "
         f"export PYTHONDONTWRITEBYTECODE=1 && "
         f"export PIP_CACHE_DIR={c}/pip && export TMPDIR={c}/tmp && export TEMP={c}/tmp && export TMP={c}/tmp && "
         f"export HF_HOME={c}/hf && export TRANSFORMERS_CACHE={c}/transformers && export TORCH_HOME={c}/torch && "
         f"export HF_DATASETS_CACHE={c}/datasets && export XDG_CACHE_HOME={c}/xdg && "
         f"export XDG_CONFIG_HOME={c}/xdg_config && export XDG_DATA_HOME={c}/xdg_data"
     )
 def _safe_rel_subdir_segment(name: str) -> str:
+    """单层子目录名（旧逻辑保留，供需单段名的场景）。"""
     n = name.strip()
     if not n or n in (".", "..") or "/" in n or "\\" in n or n.startswith("-"):
         raise ValueError(f"非法子目录名（禁止路径穿越）: {name!r}")
     return n
def _safe_rel_subpath(rel: str) -> str:
    """Validate and normalise a relative path below the extracted root.

    Multi-level paths such as ``cosmos_synthetic/single_view/generation`` are
    accepted. Surrounding whitespace, leading/trailing slashes and empty
    segments are dropped. Each remaining segment must start with an ASCII
    letter or digit and contain only letters, digits, ``_``, ``.`` and ``-``.

    Raises:
        ValueError: the path is empty, starts with ``-``, contains ``..``
            anywhere, or has a segment with characters outside the safe set.
    """
    trimmed = rel.strip().strip("/")
    if trimmed == "" or trimmed.startswith("-") or trimmed.find("..") != -1:
        raise ValueError(f"非法相对路径（禁止路径穿越）: {rel!r}")

    cleaned: list[str] = []
    for piece in trimmed.split("/"):
        if not piece:
            # Collapses doubled slashes such as "a//b".
            continue
        if piece == "." or piece == "..":
            raise ValueError(f"非法路径段: {rel!r}")
        if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.-]*", piece) is None:
            raise ValueError(f"非法路径段（仅允许安全字符）: {rel!r}")
        cleaned.append(piece)
    return "/".join(cleaned)
 def build_wipe_extracted_cmd(args: argparse.Namespace) -> list[str]:
     """删掉整个 extracted 根目录（并行分片前或单 Job 全量替换前）。"""
     vol = f"hf://buckets/{args.bucket}:/mnt/wjad"
     ext_base = f"/mnt/wjad/{args.extracted_subpath.rstrip('/')}"
     rm_parts = [
         f'echo "[wipe-subdir] {ext_base}/{d}" && rm -rf {ext_base}/{d}'
+        for d in (_safe_rel_subpath(s) for s in subdirs)
     ]
     inner = "set -e && " + " && ".join(rm_parts)
     cmd = [
         for _, ed in pairs:
             steps.append(f"mkdir -p {ed}")
+    extra = getattr(args, "extract_only_stems", None) or ""
+    extra_arg = f" --only-stems {extra}" if extra else ""
     extract_chain = " && ".join(
+        f'echo "[extract] {sd} -> {ed}" && python scripts/jobs_extract_archives.py --scan-root {sd} --out-root {ed}{extra_arg}'
         for sd, ed in pairs
     )
     steps.extend(
         [
+            "command -v git >/dev/null 2>&1 || (apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*)",
+            "pip install --no-cache-dir -q -U huggingface_hub",
+            (
+                "rm -rf {cd_} && git clone --depth 1 --single-branch "
+                "https://oauth2:$HF_TOKEN@huggingface.co/{repo} {cd_}"
+            ).format(cd_=clone_dir, repo=args.code_repo),
             f"cd {clone_dir}",
             extract_chain,
+            f"rm -rf {clone_dir}",
         ]
     )
     inner = " && ".join(steps)
             f"export WJAD_HUB_REPO={args.weights_repo}",
             f"export WJAD_DATA_ROOT=/mnt/wjad/{args.extracted_subpath}",
             "export WJAD_OUTPUT_DIR=/mnt/wjad/runs/current",
+            "command -v git >/dev/null 2>&1 || (apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*)",
+            "pip install --no-cache-dir -q -U huggingface_hub",
+            (
+                "rm -rf {cd_} && git clone --depth 1 --single-branch "
+                "https://oauth2:$HF_TOKEN@huggingface.co/{repo} {cd_}"
+            ).format(cd_=clone_dir, repo=args.code_repo),
             f"cd {clone_dir}",
+            "pip install --no-cache-dir -q -U pip",
+            "pip install --no-cache-dir -q -e .",
             "bash scripts/jobs_entry_train.sh",
         ]
     )
     return cmd
def _parse_shards_file_line(line: str) -> tuple[str, str | None]:
    """Split one shards-file line into ``(directories, only-stems)``.

    ``dir|only=generation,caption`` yields ``("dir", "generation,caption")``;
    a line without ``|`` yields ``(line, None)``. Only the ``only=`` option is
    recognised (its value is what gets passed on to the extract script);
    anything else after the ``|`` is ignored.

    The text after ``|`` is comma-separated. Because the stem list itself
    uses commas, every item that follows ``only=<stem>`` and contains no
    ``=`` is taken as a further stem of that list. An item containing ``=``
    (some other option) ends the list. If ``only=`` occurs more than once,
    the last non-empty occurrence wins.

    Raises:
        ValueError: a stem contains characters other than ASCII letters,
            digits, ``-`` and ``_``.
    """
    base, sep, flags = line.partition("|")
    base = base.strip()
    if not sep:
        return base, None

    stems: list[str] = []
    collecting = False  # True while reading the items that belong to only=
    for piece in flags.split(","):
        piece = piece.strip()
        if not piece:
            continue
        if piece.startswith("only="):
            value = piece[len("only="):].strip()
            if not value:
                # An empty "only=" is ignored and keeps any earlier value.
                collecting = False
                continue
            stems = []
            collecting = True
        elif collecting and "=" not in piece:
            value = piece
        else:
            # Unrelated option, or text before any only=: not passed on.
            collecting = False
            continue
        if not re.fullmatch(r"[A-Za-z0-9_-]+", value):
            raise ValueError(f"only= 仅允许字母数字、逗号、连字符、下划线: {line!r}")
        stems.append(value)
    return base, (",".join(stems) or None)
+def _load_shard_groups(args: argparse.Namespace) -> list[tuple[list[str], str | None]]:
+    """``--shard-dirs a,b`` → 两个 Job；文件每行一个 Job；行内逗号多目录；可选 ``|only=…``。"""
+    groups: list[tuple[list[str], str | None]] = []
     if getattr(args, "shard_dirs", None):
         for p in args.shard_dirs.split(","):
             p = p.strip()
             if p:
+                groups.append(([p], None))
     sf = getattr(args, "shards_file", None)
     if sf:
         raw = Path(sf).expanduser().read_text(encoding="utf-8")
             line = line.strip()
             if not line or line.startswith("#"):
                 continue
+            base, only = _parse_shards_file_line(line)
+            parts = [x.strip() for x in base.split(",") if x.strip()]
             if parts:
+                groups.append((parts, only))
+    seen: set[tuple[tuple[str, ...], str | None]] = set()
+    uniq: list[tuple[list[str], str | None]] = []
+    for g, o in groups:
+        t = (tuple(g), o)
         if t not in seen:
             seen.add(t)
+            uniq.append((g, o))
     return uniq
 def _load_replace_subdirs(args: argparse.Namespace) -> list[str]:
+    """从 CLI / 文件收集要预先删除的 extracted 下相对路径（可多级）。"""
     raw: list[str] = []
     rs = getattr(args, "replace_subdirs", None)
     if rs:
     seen: set[str] = set()
     out: list[str] = []
     for s in raw:
+        seg = _safe_rel_subpath(s)
         if seg not in seen:
             seen.add(seg)
             out.append(seg)
         if _wait_job(jid, label="wipe extracted subdirs") != 0:
             return 1
     jids: list[tuple[str, str]] = []
+    for grp, only_opt in groups:
         label = ",".join(grp)
         sargs = argparse.Namespace(**vars(args))
         sargs.detach = True
         sargs.replace_extracted = False
+        sargs.extract_only_stems = only_opt
         rc, jid = _submit_detach(build_extract_cmd(sargs, shard_group=grp))
         if rc != 0:
             return rc
         "--replace-subdirs",
         default=None,
         dest="replace_subdirs",
+        help="extract-parallel：仅 rm -rf 下列路径（逗号分隔，相对 extracted-subpath，可多级如 a/b/c）",
     )
     p.add_argument(
         "--replace-subdirs-file",
         default=None,
         dest="replace_subdirs_file",
+        help="extract-parallel：同上；每行相对 extracted-subpath 的一条路径（可多级）；# 注释",
     )
     p.add_argument(
         "--shard-dirs",
         "--shards-file",
         default=None,
         dest="shards_file",
+        help="extract-parallel：每行一个 Job；行内逗号多目录；可选后缀 |only=stem1,stem2（仅解压这些归档）；# 注释",
+    )
+    p.add_argument(
+        "--extract-only-stems",
+        default=None,
+        dest="extract_only_stems",
+        help="仅 action=extract：传给 jobs_extract_archives --only-stems；parallel 请用 shards 行内 |only=",
     )
     p.add_argument("--train-image", default=TRAIN_IMAGE, dest="train_image")
     p.add_argument("--train-flavor", default=TRAIN_FLAVOR, dest="train_flavor")