mapvggt / scripts /upload_hf.py
ChenmingWu's picture
Upload folder using huggingface_hub
8cf92b3 verified
Raw
History Blame Contribute Delete
3.39 kB
#!/usr/bin/env python3
"""Package + upload the MapGS/MapTokenGS bundle to a PRIVATE HF repo.
Code + checkpoints + README upload directly. The ~98GB of processed clips are
~390k small files (exceeds HF file limits) so they are tarred into <=SHARD_GB
shards and uploaded one at a time, deleting each tar after upload to stay within
local free disk. Token is read from $HF_TOKEN (never hard-coded)."""
import os, glob, tarfile, sys
from huggingface_hub import HfApi, create_repo
# Access token comes from the environment only; a missing $HF_TOKEN fails
# fast here with KeyError instead of partway through the upload.
TOKEN = os.environ["HF_TOKEN"]
# Destination repo id; override with $HF_REPO.
REPO = os.getenv("HF_REPO", "ChenmingWu/mapgs-maptokengs")
# Local root holding the code, runs/ checkpoints and data/ clips.
ROOT = "/mnt/william"
# Scratch directory where each data shard tar is written before upload.
STAGE = "/mnt/william/_upload"
# Upper bound, in decimal GB of raw file bytes, packed into one data shard.
SHARD_GB = 18.0
# Shared Hub client used by every upload call below.
api = HfApi(token=TOKEN)
def log(*a):
    """Print *a* like ``print`` does, flushing stdout immediately.

    Flushing keeps progress lines visible in real time when the output of
    this long-running script is piped or redirected to a file.
    """
    print(*a, flush=True)
def dir_size(d):
    """Return the total size in bytes of all files under directory *d*.

    The tree is walked recursively with ``os.walk``. The result is a
    best-effort estimate: any file whose size cannot be read (``OSError``)
    is skipped rather than aborting the walk. A path that does not exist
    yields 0.
    """
    total = 0
    for dirpath, _, filenames in os.walk(d):
        for name in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, name))
            except OSError:
                # Deliberately ignored: the size is only used for planning,
                # so one unreadable file must not stop the whole walk.
                continue
    return total
def _plan_shards(clip_dirs, cap):
    """Greedily group *clip_dirs*, in order, into shards of at most *cap* bytes.

    Sizes come from ``dir_size`` (raw file bytes, not tar size). A directory
    that is larger than *cap* on its own still gets a shard to itself, so a
    shard can exceed the cap in that one case. Returns a list of lists of
    directory paths.
    """
    shards, cur, cur_sz = [], [], 0
    for d in clip_dirs:
        sz = dir_size(d)
        # Close the current shard before it would overflow; never emit an
        # empty shard, hence the `cur` check.
        if cur and cur_sz + sz > cap:
            shards.append(cur)
            cur, cur_sz = [], 0
        cur.append(d)
        cur_sz += sz
    if cur:
        shards.append(cur)
    return shards


def _upload_shard(i, dirs, total):
    """Tar *dirs* into shard number *i*, upload it, then delete the local tar.

    The tar is removed in a ``finally`` so that a failed tar or upload does
    not leave a shard-sized file behind in STAGE. Any exception from tarring
    or uploading still propagates to the caller.
    """
    name = f"data_shard_{i:03d}.tar"
    tarp = f"{STAGE}/{name}"
    try:
        with tarfile.open(tarp, "w") as tf:
            for d in dirs:
                # Store paths relative to ROOT so the archive unpacks to
                # the same data/unified/... layout.
                tf.add(d, arcname=os.path.relpath(d, ROOT))
        gb = os.path.getsize(tarp) / 1e9
        api.upload_file(path_or_fileobj=tarp, path_in_repo=f"data/{name}",
                        repo_id=REPO, repo_type="model")
    finally:
        try:
            os.remove(tarp)
        except FileNotFoundError:
            # tarfile.open failed before the file was created.
            pass
    log(f"uploaded {name} ({gb:.1f} GB, {len(dirs)} clips) [{i+1}/{total}]")


def main():
    """Create the private repo and upload README, code, checkpoints and data.

    Steps, in order:
      1. Create REPO as a private model repo (no error if it already exists).
      2. Upload ROOT/README_HF.md as README.md.
      3. Upload the code folders under ROOT, skipping caches, .git, tars,
         gifs and _tokengs_repo/assets.
      4. Upload every ROOT/runs/*.safetensors whose file name does not
         contain "maptokengs_fixed".
      5. Pack the clip directories under ROOT/data/unified/*/*/* into tar
         shards of at most SHARD_GB and upload them one at a time.

    Any failure from the Hub client or the filesystem propagates and stops
    the run.
    """
    create_repo(REPO, private=True, repo_type="model", exist_ok=True, token=TOKEN)
    log(f"repo ready (private): {REPO}")

    # 1) README
    api.upload_file(path_or_fileobj=f"{ROOT}/README_HF.md", path_in_repo="README.md",
                    repo_id=REPO, repo_type="model")
    log("uploaded README.md")

    # 2) code (small file counts) — preserve structure, skip caches/git
    api.upload_folder(
        folder_path=ROOT, repo_id=REPO, repo_type="model", path_in_repo=".",
        allow_patterns=["mapgs/**", "maptokengs/**", "scripts/**", "_tokengs_repo/**"],
        ignore_patterns=["**/__pycache__/**", "**/*.pyc", "**/.git/**", "**/*.tar",
                         "_tokengs_repo/assets/**", "**/*.gif"],  # skip git-lfs pointer assets
    )
    log("uploaded code (mapgs, maptokengs, scripts, _tokengs_repo)")

    # 3) checkpoints (skip the still-being-written fixed-run files; added later)
    cks = [p for p in sorted(glob.glob(f"{ROOT}/runs/*.safetensors"))
           if "maptokengs_fixed" not in os.path.basename(p)]
    for p in cks:
        api.upload_file(path_or_fileobj=p, path_in_repo=f"runs/{os.path.basename(p)}",
                        repo_id=REPO, repo_type="model")
        log("uploaded ckpt", os.path.basename(p))

    # 4) data: pack clip dirs into <=SHARD_GB tars, upload + delete each
    os.makedirs(STAGE, exist_ok=True)
    clip_dirs = sorted(d for d in glob.glob(f"{ROOT}/data/unified/*/*/*") if os.path.isdir(d))
    log(f"clip dirs: {len(clip_dirs)} — computing sizes / packing into shards")
    shards = _plan_shards(clip_dirs, SHARD_GB * 1e9)
    log(f"{len(shards)} data shards")
    for i, dirs in enumerate(shards):
        _upload_shard(i, dirs, len(shards))
    log("=== UPLOAD COMPLETE ===")
# Run the upload only when executed as a script, not when imported.
if __name__ == "__main__":
    main()