"""On h800: download GenSegDataset tars from HF (via proxy+token), extract into Data/, then remove the tars. Produces the processed_unified layout under Data/.""" import os, glob, tarfile from huggingface_hub import snapshot_download BASE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Data" TARS = os.path.join(BASE, "_tars") print("[1] downloading tars ...", flush=True) snapshot_download("MaybeRichard/GenSegDataset", repo_type="dataset", allow_patterns=["*.tar", "README.md"], local_dir=TARS) print("[2] extracting ...", flush=True) for t in sorted(glob.glob(os.path.join(TARS, "*.tar"))): print(" extract", os.path.basename(t), flush=True) with tarfile.open(t) as tf: tf.extractall(BASE) rd = os.path.join(TARS, "README.md") if os.path.isfile(rd): os.replace(rd, os.path.join(BASE, "README.md")) print("[3] cleanup tars ...", flush=True) for t in glob.glob(os.path.join(TARS, "*.tar")): os.remove(t) try: os.rmdir(TARS) except OSError: pass print("DONE_DATA", flush=True)