"""Recovery upload: ship GenSegDataset as ONE tar per subset (10 files) instead of ~110k loose PNGs, to stay under HF's 128-commit/hour limit. Resets the partially populated repo, uploads .tar + the dataset card.""" import os, subprocess from huggingface_hub import HfApi REPO = "MaybeRichard/GenSegDataset" DATA = "/home/wzhang/LSC/Dataset/Segmentation/processed_unified" TARS = "/home/wzhang/LSC/Dataset/Segmentation/hf_tars" CARD = "/home/wzhang/LSC/Dataset/Segmentation/GenSegDataset_README.md" os.makedirs(TARS, exist_ok=True) for ds in sorted(os.listdir(DATA)): if not os.path.isdir(os.path.join(DATA, ds)): continue out = os.path.join(TARS, ds + ".tar") if os.path.exists(out) and os.path.getsize(out) > 0: print("skip (exists):", ds, flush=True); continue # -h dereferences symlinks so fold-shared images are materialized into the tar subprocess.run(["tar", "-chf", out, "-C", DATA, ds], check=True) print("tarred %s -> %.1f MB" % (ds, os.path.getsize(out) / 1e6), flush=True) api = HfApi() api.delete_repo(REPO, repo_type="dataset", missing_ok=True) api.create_repo(REPO, repo_type="dataset", private=True, exist_ok=True) print("repo reset:", REPO, flush=True) api.upload_file(path_or_fileobj=CARD, path_in_repo="README.md", repo_id=REPO, repo_type="dataset", commit_message="dataset card") api.upload_large_folder(repo_id=REPO, repo_type="dataset", folder_path=TARS) print("UPLOAD_DONE", flush=True)