File size: 10,001 Bytes
b751bb5
 
 
 
 
 
 
 
 
 
 
 
53d5d9f
b751bb5
53d5d9f
37d98fb
 
b751bb5
 
53d5d9f
 
b751bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37d98fb
 
 
 
 
53d5d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37d98fb
 
 
 
 
b751bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53d5d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b751bb5
37d98fb
 
b751bb5
 
 
 
 
 
 
 
 
 
37d98fb
b751bb5
53d5d9f
 
 
 
 
 
 
 
b751bb5
 
 
 
 
 
 
37d98fb
 
b751bb5
53d5d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b751bb5
53d5d9f
 
 
 
 
 
b751bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53d5d9f
 
 
 
 
 
37d98fb
 
 
 
53d5d9f
37d98fb
53d5d9f
 
 
 
 
 
 
 
37d98fb
b751bb5
37d98fb
 
 
b751bb5
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env python3
"""
Upload trained artifacts to Hugging Face Hub.

This repo uses local-path inference. The upload is intended so you can later
download these directories into the same folder layout and run inference.
"""

from __future__ import annotations

import argparse
import os
import shutil
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path

LARGE_FILE_UPLOAD_THRESHOLD_BYTES = 100 * 1024 * 1024


def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.")
    parser.add_argument(
        "--repo-id",
        required=True,
        help="HF repo id, e.g. 'yourname/admesh-intent-iab-v1'.",
    )
    parser.add_argument(
        "--token",
        default=os.environ.get("HF_TOKEN"),
        help="HF token. If omitted, uses env HF_TOKEN.",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create the repo as private.",
    )
    parser.add_argument(
        "--include-multitask",
        action="store_true",
        help="Upload multitask intent model output directory.",
    )
    parser.add_argument(
        "--include-iab",
        action="store_true",
        help="Upload IAB classifier model output directory.",
    )
    parser.add_argument(
        "--include-calibration",
        action="store_true",
        help="Upload artifacts/calibration directory.",
    )
    parser.add_argument(
        "--include-hf-readme",
        action="store_true",
        help="Upload a Hugging Face model card file as README.md in the Hub repo root.",
    )
    parser.add_argument(
        "--include-serving-code",
        action="store_true",
        help="Upload core runtime Python/code files required for Hub trust_remote_code inference.",
    )
    parser.add_argument(
        "--include-root-checkpoint",
        action="store_true",
        help="Upload root-level compatibility checkpoint/tokenizer files used by transformers.pipeline loader.",
    )
    parser.add_argument(
        "--include-all",
        action="store_true",
        help=(
            "Upload everything needed for end-to-end Hub usage: multitask + iab + calibration + "
            "HF README + serving code + root checkpoint/tokenizer files."
        ),
    )
    parser.add_argument(
        "--hf-readme-path",
        default="HF_MODEL_CARD.md",
        help="Local path to the HF model card markdown to upload as README.md (relative to repo root).",
    )
    parser.add_argument(
        "--multitask-dir",
        default="multitask_intent_model_output",
        help="Path to multitask intent output directory (relative to this script's base).",
    )
    parser.add_argument(
        "--iab-dir",
        default="iab_classifier_model_output",
        help="Path to IAB classifier model output directory (relative to this script's base).",
    )
    parser.add_argument(
        "--calibration-dir",
        default="artifacts/calibration",
        help="Path to calibration artifacts directory (relative to this script's base).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be uploaded without actually uploading.",
    )
    return parser.parse_args()


def _iter_local_files(path: Path) -> list[Path]:
    if path.is_file():
        return [path]
    return sorted(p for p in path.rglob("*") if p.is_file())


def _remote_file_paths(path_in_repo: str, local_path: Path) -> list[str]:
    if local_path.is_file():
        return [path_in_repo]
    return [
        f"{path_in_repo}/{file_path.relative_to(local_path).as_posix()}"
        for file_path in _iter_local_files(local_path)
    ]


def _requires_large_upload(local_path: Path) -> bool:
    """Return True when any file at or under *local_path* is at least
    LARGE_FILE_UPLOAD_THRESHOLD_BYTES, i.e. needs the large-folder upload path.
    """
    for file_path in _iter_local_files(local_path):
        if file_path.stat().st_size >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES:
            return True
    return False


def _upload_via_large_folder(api, repo_id: str, repo_path: str, local_path: Path) -> None:
    """Stage *local_path* under *repo_path* inside a temp directory and push it
    with the Hub's resumable large-folder API.

    The staging step preserves the repo-relative layout so the whole temp root
    can be uploaded in one call.
    """
    with tempfile.TemporaryDirectory(prefix="hf_large_upload_") as tmp_dir:
        staging_root = Path(tmp_dir)
        destination = staging_root / repo_path
        destination.parent.mkdir(parents=True, exist_ok=True)
        if local_path.is_file():
            shutil.copy2(local_path, destination)
        else:
            ignored = shutil.ignore_patterns(".cache", "__pycache__")
            shutil.copytree(local_path, destination, ignore=ignored)
        # Drop any resumable-upload bookkeeping carried over from previous
        # local attempts so the staged tree starts clean.
        shutil.rmtree(destination / ".cache", ignore_errors=True)
        api.upload_large_folder(
            repo_id=repo_id,
            repo_type="model",
            folder_path=str(staging_root),
            print_report=False,
        )


def _verify_remote_upload(api, repo_id: str, repo_path: str, local_path: Path) -> None:
    """Poll the Hub until every file expected under *repo_path* is listed.

    Performs up to four listings with a linearly growing backoff between
    attempts; raises RuntimeError naming (up to 20 of) the still-missing
    remote paths if the final attempt comes up short.
    """
    expected = set(_remote_file_paths(repo_path, local_path))
    max_attempts = 4
    for attempt in range(max_attempts):
        remote = set(api.list_repo_files(repo_id=repo_id, repo_type="model"))
        still_missing = sorted(expected - remote)
        if not still_missing:
            return
        if attempt + 1 == max_attempts:
            raise RuntimeError(
                "Upload completed but the following remote files are still missing: "
                + ", ".join(still_missing[:20])
            )
        time.sleep(2 * (attempt + 1))


def main() -> int:
    """Entry point: resolve local artifact paths and upload each selected item
    to the Hugging Face Hub repo given by ``--repo-id``.

    Returns:
        Process exit code: 0 on success, 2 for usage errors (missing token,
        nothing selected, missing huggingface_hub dependency).
    """
    # Wall-clock and monotonic timestamps for the summary printed at the end.
    started_at = time.perf_counter()
    started_wall = datetime.now(timezone.utc).isoformat()
    args = _parse_args()
    if not args.token:
        print("Missing HF token. Provide --token or set env HF_TOKEN.", file=sys.stderr)
        return 2

    # All relative CLI paths are resolved against the parent of this script's
    # directory (the repo root, assuming the script lives one level down).
    repo_root = Path(__file__).resolve().parent.parent

    multitask_dir = (repo_root / args.multitask_dir).resolve()
    iab_dir = (repo_root / args.iab_dir).resolve()
    calibration_dir = (repo_root / args.calibration_dir).resolve()
    hf_readme_path = (repo_root / args.hf_readme_path).resolve()

    # --include-all is sugar for switching on every individual include flag.
    if args.include_all:
        args.include_multitask = True
        args.include_iab = True
        args.include_calibration = True
        args.include_hf_readme = True
        args.include_serving_code = True
        args.include_root_checkpoint = True

    # Each entry is (path inside the Hub repo, local path to upload from).
    to_upload: list[tuple[str, Path]] = []
    if args.include_multitask:
        to_upload.append(("multitask_intent_model_output", multitask_dir))
    if args.include_iab:
        to_upload.append(("iab_classifier_model_output", iab_dir))
    if args.include_calibration:
        to_upload.append(("artifacts/calibration", calibration_dir))
    if args.include_hf_readme:
        to_upload.append(("README.md", hf_readme_path))

    if args.include_serving_code:
        # Files needed by trust_remote_code execution path.
        for rel in [
            "pipeline.py",
            "config.py",
            "config.json",
            "combined_inference.py",
            "model_runtime.py",
            "multitask_runtime.py",
            "multitask_model.py",
            "schemas.py",
            "inference_intent_type.py",
            "inference_subtype.py",
            "inference_decision_phase.py",
            "inference_iab_classifier.py",
            "iab_classifier.py",
            "iab_taxonomy.py",
        ]:
            to_upload.append((rel, (repo_root / rel).resolve()))

    if args.include_root_checkpoint:
        # Root-level checkpoint/tokenizer files expected by the
        # transformers.pipeline loader.
        for rel in [
            "model.safetensors",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.txt",
        ]:
            to_upload.append((rel, (repo_root / rel).resolve()))

    if not to_upload:
        print(
            "Nothing to upload. Pass include flags (e.g. --include-all), or one/more of: "
            "--include-multitask --include-iab --include-calibration --include-hf-readme "
            "--include-serving-code --include-root-checkpoint.",
            file=sys.stderr,
        )
        return 2

    # Import lazily so `--dry-run` works without extra deps.
    try:
        from huggingface_hub import HfApi
    except ModuleNotFoundError:
        print("Missing dependency: huggingface_hub. Install with: pip install huggingface_hub", file=sys.stderr)
        return 2

    api = HfApi(token=args.token)
    # exist_ok=True makes repeated runs against the same repo idempotent.
    api.create_repo(repo_id=args.repo_id, repo_type="model", private=args.private, exist_ok=True)

    for repo_path, local_dir in to_upload:
        # Missing local paths are skipped (with a warning), not fatal, so a
        # partial checkout can still upload what it has.
        if not local_dir.exists():
            print(f"[SKIP] {repo_path}: local path does not exist: {local_dir}", file=sys.stderr)
            continue
        if args.dry_run:
            print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}")
            continue
        step_start = time.perf_counter()
        # Items containing any file >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES go
        # through the resumable large-folder path; everything else uses the
        # standard single-file/folder upload API.
        mode = "large-folder" if _requires_large_upload(local_dir) else "standard"
        print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path} ({mode})")
        if mode == "large-folder":
            _upload_via_large_folder(api, args.repo_id, repo_path, local_dir)
        elif local_dir.is_file():
            api.upload_file(
                repo_id=args.repo_id,
                repo_type="model",
                path_or_fileobj=str(local_dir),
                path_in_repo=repo_path,
            )
        else:
            api.upload_folder(
                repo_id=args.repo_id,
                repo_type="model",
                folder_path=str(local_dir),
                path_in_repo=repo_path,
            )
        # Confirm the Hub actually lists every expected file before moving on.
        _verify_remote_upload(api, args.repo_id, repo_path, local_dir)
        print(f"[DONE ] {repo_path} took {(time.perf_counter() - step_start):.2f}s")

    ended_wall = datetime.now(timezone.utc).isoformat()
    elapsed_s = time.perf_counter() - started_at
    print(f"Upload complete.\nstart: {started_wall}\nend:   {ended_wall}\ntotal: {elapsed_s:.2f}s")
    return 0


# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())