GenSeg-Baselines / code /scripts /hf_upload_tars.py
MaybeRichard's picture
code: complete eval pipeline (7 metrics + per-class + Wilcoxon) + Swin-UNet/TransUNet networks; remove backups/obsolete
1a18f22 verified
Raw
History Blame Contribute Delete
1.48 kB
"""Recovery upload: ship GenSegDataset as ONE tar per subset (10 files) instead of
~110k loose PNGs, to stay under HF's 128-commit/hour limit. Deletes and re-creates
the partially populated repo (destructive), then uploads the dataset card and
every <subset>.tar."""
import os, subprocess
from huggingface_hub import HfApi
# Hugging Face dataset repo that receives the upload.
REPO = "MaybeRichard/GenSegDataset"
# Local root shared by every path this script touches.
_SEG_ROOT = "/home/wzhang/LSC/Dataset/Segmentation"
DATA = _SEG_ROOT + "/processed_unified"  # input: one sub-directory per subset
TARS = _SEG_ROOT + "/hf_tars"  # output: one <subset>.tar per input sub-directory
CARD = _SEG_ROOT + "/GenSegDataset_README.md"  # dataset card, uploaded as README.md
# Build one tar archive per subset directory found directly under DATA.
#
# Each archive is first written under a temporary name and only renamed to
# <subset>.tar once tar has exited successfully. Without this, a run that is
# interrupted mid-archive leaves a truncated but non-empty <subset>.tar, which
# the "skip (exists)" check below would then treat as finished on every later
# run and which would be uploaded as if it were complete.
os.makedirs(TARS, exist_ok=True)
for ds in sorted(os.listdir(DATA)):
    # Only directories are subsets; stray files under DATA are ignored.
    if not os.path.isdir(os.path.join(DATA, ds)):
        continue
    out = os.path.join(TARS, ds + ".tar")
    # Resume support: a non-empty archive from a previous run is kept as-is.
    if os.path.exists(out) and os.path.getsize(out) > 0:
        print("skip (exists):", ds, flush=True)
        continue
    partial = out + ".partial"
    try:
        # -h dereferences symlinks so fold-shared images are materialized into the tar
        subprocess.run(["tar", "-chf", partial, "-C", DATA, ds], check=True)
        # Publish the finished archive under its final name in one step.
        os.replace(partial, out)
    finally:
        # After a successful rename the temporary file no longer exists; on
        # failure or interruption remove the incomplete archive so it is
        # neither reused nor left in the folder that gets uploaded.
        if os.path.exists(partial):
            os.remove(partial)
    print("tarred %s -> %.1f MB" % (ds, os.path.getsize(out) / 1e6), flush=True)
# Reset the remote dataset repo and upload the dataset card plus the tar folder.
#
# Deleting the repo cannot be undone, so the local inputs are checked first:
# without these checks a missing card or an empty archive folder would wipe the
# remote repo and then fail (or upload nothing), leaving it empty.
if not os.path.isfile(CARD):
    raise FileNotFoundError("dataset card not found: %s" % CARD)
if not any(name.endswith(".tar") for name in os.listdir(TARS)):
    raise FileNotFoundError("no .tar archives found in: %s" % TARS)

api = HfApi()
# Drop whatever the repo currently holds, then re-create it empty and private.
api.delete_repo(REPO, repo_type="dataset", missing_ok=True)
api.create_repo(REPO, repo_type="dataset", private=True, exist_ok=True)
print("repo reset:", REPO, flush=True)
# The card goes up as its own commit, published as the repo's README.md.
api.upload_file(
    path_or_fileobj=CARD,
    path_in_repo="README.md",
    repo_id=REPO,
    repo_type="dataset",
    commit_message="dataset card",
)
# Upload the contents of the archive folder.
api.upload_large_folder(repo_id=REPO, repo_type="dataset", folder_path=TARS)
print("UPLOAD_DONE", flush=True)