| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import argparse |
| import os |
| import os.path as osp |
| import re |
| import time |
| from hashlib import sha1, sha256 |
|
|
| from huggingface_hub import HfApi |
| from huggingface_hub.hf_api import CommitOperationAdd |
| from termcolor import colored |
|
|
# Batching limits for a single Hub commit: at most 64 files and 64 GiB of data.
MAX_UPLOAD_FILES_PER_COMMIT = 64
MAX_UPLOAD_SIZE_PER_COMMIT = 64 * 1024 * 1024 * 1024
# Files above this size (45 GiB) are skipped entirely — the Hub rejects
# uploads of single files this large (see the skip message in main()).
MAX_UPLOAD_SIZE_PER_SINGLE_FILE = 45 * 1024 * 1024 * 1024
|
|
|
|
def compute_git_hash(filename):
    """Return the git blob SHA-1 hex digest of *filename*.

    Matches ``git hash-object <file>``: hashes the header ``blob <size>\\0``
    followed by the raw file bytes. Reads in 1 MiB chunks so multi-GiB
    files (this script allows up to 45 GiB) are never loaded into memory.
    """
    # The blob header needs the byte length up front, so stat the file
    # instead of reading it all just to call len() on the bytes.
    size = os.path.getsize(filename)
    h = sha1()
    h.update(("blob %u\0" % size).encode("utf-8"))
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()
|
|
|
|
def compute_lfs_hash(filename):
    """Return the SHA-256 hex digest of the file contents.

    This is the OID Git LFS stores for a file, compared against
    ``hf_meta.lfs["sha256"]`` in main(). Reads in 1 MiB chunks so
    multi-GiB files are never loaded into memory at once.
    """
    h = sha256()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()
|
|
|
|
def main():
    """Upload a local folder to a Hugging Face Hub repo.

    Walks the folder, skips excluded/oversized/already-uploaded files
    (verified by git or LFS hash), and pushes the rest in batched commits
    of at most MAX_UPLOAD_FILES_PER_COMMIT files / MAX_UPLOAD_SIZE_PER_COMMIT
    bytes. Optionally sleeps and retries on commit errors.
    """
    # Disable curl CA verification — NOTE(review): presumably a workaround
    # for a proxy/self-signed-cert environment; confirm this is intended.
    os.environ["CURL_CA_BUNDLE"] = ""

    parser = argparse.ArgumentParser()
    parser.add_argument("local_folder", type=str)
    parser.add_argument("--model-name", type=str, default=None)
    parser.add_argument("--repo-type", type=str, choices=["model", "dataset"])
    parser.add_argument("--repo-org", type=str, default="Efficient-Large-Model")
    parser.add_argument("--repo-id", type=str, default=None)
    parser.add_argument("--root-dir", type=str, default=None)
    parser.add_argument("--token", type=str, default=None)
    # Regex patterns matched against the repo-relative path; matches are skipped.
    parser.add_argument("-e", "--exclude", action="append", default=[r"checkpoint-[\d]*/.*", ".git/.*", "wandb/.*"])
    # With --fast-check, existence on the Hub is enough to skip (no hash compare).
    parser.add_argument("--fast-check", action="store_true")
    parser.add_argument("--sleep-on-error", action="store_true")
    args = parser.parse_args()

    if args.token is None:
        api = HfApi()
    else:
        print("initing using token from cmd args.")
        api = HfApi(token=args.token)

    repo_type = args.repo_type
    local_folder = args.local_folder

    if args.repo_id is not None:
        repo = args.repo_id
    else:
        # Derive the repo name from the folder basename; a trailing slash
        # would make basename() return "", so strip it first.
        if local_folder.endswith("/"):
            local_folder = local_folder[:-1]
        if args.model_name is None:
            model_name = osp.basename(local_folder).replace("+", "-")
        else:
            model_name = args.model_name
        repo = osp.join(args.repo_org, model_name)

    local_folder = os.path.expanduser(local_folder)
    # root_dir controls the path prefix stripped off to form in-repo paths.
    root_dir = local_folder if args.root_dir is None else args.root_dir
    print(f"uploading {local_folder} to {repo}")
    if not api.repo_exists(repo, repo_type=repo_type):
        api.create_repo(
            repo_id=repo,
            private=True,
            repo_type=repo_type,
        )

    base_url = "https://hf.co/datasets" if args.repo_type == "dataset" else "https://hf.co"
    print(colored(f"Uploading {osp.join(base_url, repo)}", "green"))

    commit_message = "Upload files with vila-upload."

    def _commit(operations, description):
        """Create one commit, retrying forever if --sleep-on-error is set."""
        while True:
            try:
                return api.create_commit(
                    repo_id=repo,
                    repo_type=repo_type,
                    operations=operations,
                    commit_message=commit_message,
                    commit_description=description,
                )
            except RuntimeError as e:
                print(e)
                if not args.sleep_on_error:
                    raise
                # Message matches the actual sleep (1800 s = 30 minutes);
                # original said "one hour" which contradicted the code.
                print("sleeping for 30 minutes then re-try")
                time.sleep(1800)

    ops = []
    commit_description = ""
    commit_size = 0
    for root, dirs, files in os.walk(local_folder, topdown=True):
        dirs.sort()  # deterministic traversal order across runs
        for name in files:
            fpath = osp.join(root, name)
            rpath = osp.relpath(fpath, osp.abspath(root_dir))

            matched_pattern = None
            for pattern in args.exclude:
                if re.search(pattern, rpath):
                    matched_pattern = pattern
            if matched_pattern is not None:
                print(colored(f"""[regex filter-out][{matched_pattern}]: {rpath}, skipping""", "yellow"))
                continue

            if osp.getsize(fpath) > MAX_UPLOAD_SIZE_PER_SINGLE_FILE:
                print(
                    colored(
                        f"Huggingface only supports filesize less than {MAX_UPLOAD_SIZE_PER_SINGLE_FILE}, skipping",
                        "red",
                    )
                )
                continue

            if api.file_exists(repo_id=repo, filename=rpath, repo_type=repo_type):
                if args.fast_check:
                    print(
                        colored(
                            f"Already uploaded {rpath}, fast check pass, skipping",
                            "green",
                        )
                    )
                    continue
                # Compare the local hash against the Hub's: LFS files carry a
                # SHA-256 OID, plain git blobs a SHA-1 blob id.
                hf_meta = api.get_paths_info(repo_id=repo, paths=rpath, repo_type=repo_type)[0]
                if hf_meta.lfs is not None:
                    remote_hash = hf_meta.lfs["sha256"]
                    local_hash = compute_lfs_hash(fpath)
                else:
                    remote_hash = hf_meta.blob_id
                    local_hash = compute_git_hash(fpath)

                if remote_hash == local_hash:
                    print(
                        colored(
                            f"Already uploaded {rpath}, hash check pass, skipping",
                            "green",
                        )
                    )
                    continue
                print(
                    colored(
                        f"{rpath} is not same as local version, re-uploading...",
                        "red",
                    )
                )

            operation = CommitOperationAdd(
                path_or_fileobj=fpath,
                path_in_repo=rpath,
            )
            print(colored(f"Commiting {rpath}", "green"))
            ops.append(operation)
            commit_size += operation.upload_info.size
            commit_description += f"Upload {rpath}\n"

            # Flush once either batching limit is exceeded.
            if len(ops) > MAX_UPLOAD_FILES_PER_COMMIT or commit_size > MAX_UPLOAD_SIZE_PER_COMMIT:
                commit_info = _commit(ops, commit_description)
                print(colored(f"Finish {commit_info}", "yellow"))
                ops = []
                commit_description = ""
                commit_size = 0

    # Final partial batch. Original code committed unconditionally (creating
    # an empty commit when nothing was pending) and without retry; both fixed.
    if ops:
        commit_info = _commit(ops, commit_description)
        print(colored(f"Finish {commit_info}", "yellow"))
|
|
|
|
# Script entry point: only run the uploader when executed directly.
if __name__ == "__main__":
    main()
|
|