#!/usr/bin/env python3
"""Package + upload the MapGS/MapTokenGS bundle to a PRIVATE HF repo.

Code + checkpoints + README upload directly. The ~98GB of processed clips are
~390k small files (exceeds HF file limits) so they are tarred into shards of
roughly <=SHARD_GB each and uploaded one at a time, deleting each tar after
the upload attempt (successful or not) to stay within local free disk.

Token is read from $HF_TOKEN (never hard-coded).
"""
import glob
import os
import sys
import tarfile

from huggingface_hub import HfApi, create_repo

TOKEN = os.environ["HF_TOKEN"]
REPO = os.environ.get("HF_REPO", "ChenmingWu/mapgs-maptokengs")
ROOT = "/mnt/william"
STAGE = "/mnt/william/_upload"
SHARD_GB = 18.0

api = HfApi(token=TOKEN)


def log(*a):
    """Print the given values and flush immediately so progress is visible."""
    print(*a, flush=True)


def dir_size(d):
    """Return the total size in bytes of all files found under directory *d*.

    Files whose size cannot be read (``OSError``) are skipped on purpose:
    the result is only used to plan shard boundaries, so a best-effort
    total is sufficient.
    """
    total = 0
    for root, _, files in os.walk(d):
        for name in files:
            try:
                total += os.path.getsize(os.path.join(root, name))
            except OSError:
                # Best-effort: an unreadable entry just does not count.
                pass
    return total


def _plan_shards(clip_dirs, cap_bytes):
    """Greedily group *clip_dirs* (in order) into shards of about *cap_bytes*.

    A shard is closed as soon as adding the next directory would push its
    summed file size past *cap_bytes*. A single directory that is larger
    than the cap on its own still gets a shard (containing only itself), so
    the cap is a target rather than a hard limit.

    Returns a list of lists of directory paths.
    """
    shards = []
    current, current_size = [], 0
    for d in clip_dirs:
        size = dir_size(d)
        if current and current_size + size > cap_bytes:
            shards.append(current)
            current, current_size = [], 0
        current.append(d)
        current_size += size
    if current:
        shards.append(current)
    return shards


def _upload_shard(index, total, dirs):
    """Tar *dirs* into one shard under STAGE, upload it, and remove the tar.

    The local tar is deleted in a ``finally`` clause so that a failed tar
    build or a failed upload does not leave a multi-GB file behind in
    STAGE; the original exception still propagates to the caller.
    """
    name = f"data_shard_{index:03d}.tar"
    tarp = f"{STAGE}/{name}"
    try:
        with tarfile.open(tarp, "w") as tf:
            for d in dirs:
                # Store members relative to ROOT so the archive unpacks
                # back into the same data/unified/... layout.
                tf.add(d, arcname=os.path.relpath(d, ROOT))
        gb = os.path.getsize(tarp) / 1e9
        api.upload_file(path_or_fileobj=tarp,
                        path_in_repo=f"data/{name}",
                        repo_id=REPO, repo_type="model")
    finally:
        try:
            os.remove(tarp)
        except FileNotFoundError:
            # tarfile.open itself failed before creating the file.
            pass
    log(f"uploaded {name} ({gb:.1f} GB, {len(dirs)} clips) [{index + 1}/{total}]")


def main():
    """Create the private repo and upload README, code, checkpoints, and data.

    Stages run in order: (1) README, (2) code folders, (3) checkpoints in
    ``runs/``, (4) clip directories packed into tar shards. Any exception
    from the Hugging Face client or the filesystem aborts the run.
    """
    create_repo(REPO, private=True, repo_type="model", exist_ok=True, token=TOKEN)
    log(f"repo ready (private): {REPO}")

    # 1) README
    api.upload_file(path_or_fileobj=f"{ROOT}/README_HF.md",
                    path_in_repo="README.md",
                    repo_id=REPO, repo_type="model")
    log("uploaded README.md")

    # 2) code (small file counts) — preserve structure, skip caches/git
    api.upload_folder(
        folder_path=ROOT, repo_id=REPO, repo_type="model", path_in_repo=".",
        allow_patterns=["mapgs/**", "maptokengs/**", "scripts/**", "_tokengs_repo/**"],
        ignore_patterns=["**/__pycache__/**", "**/*.pyc", "**/.git/**", "**/*.tar",
                         "_tokengs_repo/assets/**", "**/*.gif"],  # skip git-lfs pointer assets
    )
    log("uploaded code (mapgs, maptokengs, scripts, _tokengs_repo)")

    # 3) checkpoints (skip the still-being-written fixed-run files; added later)
    cks = [p for p in sorted(glob.glob(f"{ROOT}/runs/*.safetensors"))
           if "maptokengs_fixed" not in os.path.basename(p)]
    for p in cks:
        api.upload_file(path_or_fileobj=p,
                        path_in_repo=f"runs/{os.path.basename(p)}",
                        repo_id=REPO, repo_type="model")
        log("uploaded ckpt", os.path.basename(p))

    # 4) data: pack clip dirs into <=SHARD_GB tars, upload + delete each
    os.makedirs(STAGE, exist_ok=True)
    clip_dirs = sorted(d for d in glob.glob(f"{ROOT}/data/unified/*/*/*")
                       if os.path.isdir(d))
    log(f"clip dirs: {len(clip_dirs)} — computing sizes / packing into shards")
    shards = _plan_shards(clip_dirs, SHARD_GB * 1e9)
    log(f"{len(shards)} data shards")
    for i, dirs in enumerate(shards):
        _upload_shard(i, len(shards), dirs)

    log("=== UPLOAD COMPLETE ===")


if __name__ == "__main__":
    main()