File size: 3,389 Bytes
8cf92b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3
"""Package + upload the MapGS/MapTokenGS bundle to a PRIVATE HF repo.

Code + checkpoints + README upload directly. The ~98GB of processed clips are
~390k small files (exceeds HF file limits) so they are tarred into <=SHARD_GB
shards and uploaded one at a time, deleting each tar after upload to stay within
local free disk. Token is read from $HF_TOKEN (never hard-coded)."""
import os, glob, tarfile, sys
from huggingface_hub import HfApi, create_repo

# Hugging Face access token, taken from the environment so it never appears in
# the source. A missing HF_TOKEN raises KeyError here, at import time.
TOKEN = os.environ["HF_TOKEN"]
# Target repo id; can be overridden through $HF_REPO.
REPO = os.environ.get("HF_REPO", "ChenmingWu/mapgs-maptokengs")
# Local root that main() reads the README, code folders, runs/ and data/ from.
ROOT = "/mnt/william"
# Scratch directory where each data shard tar is written before it is uploaded.
STAGE = "/mnt/william/_upload"
# Per-shard size budget in decimal gigabytes (main() multiplies it by 1e9).
SHARD_GB = 18.0
# Shared Hub client, authenticated with TOKEN, used for every upload in main().
api = HfApi(token=TOKEN)


def log(*a):
    """Print the given values space-separated on one line and flush immediately.

    Flushing on every call keeps progress visible in real time when the output
    is piped or redirected to a file.
    """
    print(*a, file=sys.stdout, flush=True)


def dir_size(d):
    """Return the combined size in bytes of every file found under *d*.

    The tree is traversed with os.walk. A file whose size cannot be read
    (os.path.getsize raises OSError) contributes nothing to the total instead
    of aborting the scan.
    """
    def _size_or_zero(path):
        # Best-effort: an unreadable or vanished entry counts as zero bytes.
        try:
            return os.path.getsize(path)
        except OSError:
            return 0

    return sum(
        _size_or_zero(os.path.join(parent, name))
        for parent, _subdirs, names in os.walk(d)
        for name in names
    )


def _pack_shards(clip_dirs, cap):
    """Greedily group *clip_dirs*, in the given order, into shards of <= *cap* bytes.

    Sizes are measured with dir_size(). A directory is never split, so one
    that is larger than *cap* on its own still becomes a (single-entry) shard.
    Returns a list of lists of directory paths.
    """
    shards, cur, cur_sz = [], [], 0
    for d in clip_dirs:
        sz = dir_size(d)
        # Close the current shard before it would overflow; the `cur` guard
        # prevents emitting an empty shard when the first dir is oversized.
        if cur and cur_sz + sz > cap:
            shards.append(cur)
            cur, cur_sz = [], 0
        cur.append(d)
        cur_sz += sz
    if cur:
        shards.append(cur)
    return shards


def _upload_shard(i, dirs, total):
    """Tar *dirs* as shard number *i* under STAGE, upload it, and delete the tar.

    *total* is the overall shard count and is used only for the progress line.
    The staged tar is removed whether or not tarring/uploading succeeds; any
    exception is re-raised to the caller after the cleanup.
    """
    tarp = f"{STAGE}/data_shard_{i:03d}.tar"
    try:
        with tarfile.open(tarp, "w") as tf:
            for d in dirs:
                # Store paths relative to ROOT so the archive keeps the
                # data/unified/... layout.
                tf.add(d, arcname=os.path.relpath(d, ROOT))
        gb = os.path.getsize(tarp) / 1e9
        api.upload_file(path_or_fileobj=tarp, path_in_repo=f"data/data_shard_{i:03d}.tar",
                        repo_id=REPO, repo_type="model")
    finally:
        # Reclaim the staging space even on failure: a leftover multi-GB tar
        # would eat the free disk that the next attempt needs.
        if os.path.exists(tarp):
            os.remove(tarp)
    log(f"uploaded data_shard_{i:03d}.tar ({gb:.1f} GB, {len(dirs)} clips) [{i+1}/{total}]")


def main():
    """Create the private repo and upload README, code, checkpoints and data.

    Runs four steps in order, each through its own Hub call(s):
      1. ROOT/README_HF.md is uploaded as README.md.
      2. The code folders under ROOT are uploaded with their structure kept.
      3. Every ROOT/runs/*.safetensors file is uploaded, except those whose
         name contains "maptokengs_fixed".
      4. The clip directories matching ROOT/data/unified/*/*/* are packed into
         tar shards of at most SHARD_GB decimal gigabytes (as measured by
         dir_size) and uploaded one shard at a time.

    Exceptions from the Hub client or the filesystem are not caught here and
    propagate to the caller.
    """
    create_repo(REPO, private=True, repo_type="model", exist_ok=True, token=TOKEN)
    log(f"repo ready (private): {REPO}")

    # 1) README
    api.upload_file(path_or_fileobj=f"{ROOT}/README_HF.md", path_in_repo="README.md",
                    repo_id=REPO, repo_type="model")
    log("uploaded README.md")

    # 2) code (small file counts) — preserve structure, skip caches/git
    api.upload_folder(folder_path=ROOT, repo_id=REPO, repo_type="model", path_in_repo=".",
                      allow_patterns=["mapgs/**", "maptokengs/**", "scripts/**", "_tokengs_repo/**"],
                      ignore_patterns=["**/__pycache__/**", "**/*.pyc", "**/.git/**", "**/*.tar",
                                       "_tokengs_repo/assets/**", "**/*.gif"])  # skip git-lfs pointer assets
    log("uploaded code (mapgs, maptokengs, scripts, _tokengs_repo)")

    # 3) checkpoints (skip the still-being-written fixed-run files; added later)
    cks = [p for p in sorted(glob.glob(f"{ROOT}/runs/*.safetensors"))
           if "maptokengs_fixed" not in os.path.basename(p)]
    for p in cks:
        api.upload_file(path_or_fileobj=p, path_in_repo=f"runs/{os.path.basename(p)}",
                        repo_id=REPO, repo_type="model")
        log("uploaded ckpt", os.path.basename(p))

    # 4) data: pack clip dirs into <=SHARD_GB tars, upload + delete each
    os.makedirs(STAGE, exist_ok=True)
    clip_dirs = sorted(d for d in glob.glob(f"{ROOT}/data/unified/*/*/*") if os.path.isdir(d))
    log(f"clip dirs: {len(clip_dirs)} — computing sizes / packing into shards")
    shards = _pack_shards(clip_dirs, SHARD_GB * 1e9)
    log(f"{len(shards)} data shards")

    for i, dirs in enumerate(shards):
        _upload_shard(i, dirs, len(shards))

    log("=== UPLOAD COMPLETE ===")


# Run the upload only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()